In [1]:
from keras.models import load_model
import pandas as pd
import numpy as np

import os
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

from pathlib import Path

from data.constants import DEEPDTA_MODEL_PATH, LIGAND_PREDICTION_REPORT_PATH

In [2]:
# Character-level vocabulary for SMILES strings: every symbol maps to a
# consecutive integer code, and the empty string holds code 0.
smiles_dict = {
    symbol: code
    for code, symbol in enumerate(
        [''] + list("#%)(+-.0123456789=@ABCFHINOPS[\\]_acegilnorstu|")
    )
}


# Vocabulary for protein sequences: the 20 one-letter residue codes listed
# alphabetically are numbered 1..20, and the empty string holds code 0.
protein_dict = {
    residue: code
    for code, residue in enumerate([''] + list("ACDEFGHIKLMNPQRSTVWY"))
}

In [3]:
def cindex_score(y_true, y_pred):
    """
    Concordance Index (CI) between true and predicted values.

    Thin wrapper around ``lifelines.utils.concordance_index``.

    :param y_true: Observed values.
    :param y_pred: Predicted values.
    :return: Whatever ``concordance_index(y_true, y_pred)`` returns.
    """
    # Imported inside the function, so lifelines is only needed when the
    # metric is actually evaluated.
    from lifelines.utils import concordance_index as _concordance_index

    score = _concordance_index(y_true, y_pred)
    return score

# Load the saved model from DEEPDTA_MODEL_PATH. The custom metric is passed
# under its own name so it can be resolved while the model is deserialised.
model = load_model(DEEPDTA_MODEL_PATH, custom_objects=dict(cindex_score=cindex_score))

In [4]:

# Функции кодирования
def encode_smiles(smiles, max_len=100):
    """
    Encode a SMILES string as a fixed-length vector of vocabulary codes.

    Each of the first ``max_len`` characters is looked up in ``smiles_dict``.
    Characters missing from the vocabulary become 0, and positions past the
    end of the string stay 0.

    Missing values (``None`` or NaN, which pandas produces for empty CSV
    cells) are encoded as an all-zero vector instead of raising ``TypeError``.

    :param smiles: SMILES string, or a missing value (``None``/NaN).
    :param max_len: Length of the returned vector.
    :return: 1-D NumPy array of length ``max_len`` (default float dtype).
    """
    encoded = np.zeros((max_len,))

    # A missing ligand cannot be sliced; treat it like an empty string.
    if smiles is None or (isinstance(smiles, float) and np.isnan(smiles)):
        return encoded

    codes = [smiles_dict.get(ch, 0) for ch in smiles[:max_len]]
    encoded[:len(codes)] = codes
    return encoded

def encode_protein(sequence, max_len=1000):
    """
    Encode a protein sequence as a fixed-length vector of vocabulary codes.

    Each of the first ``max_len`` residues is looked up in ``protein_dict``.
    Residues missing from the vocabulary become 0, and positions past the end
    of the sequence stay 0.

    Missing values (``None`` or NaN, which pandas produces for empty CSV
    cells) are encoded as an all-zero vector instead of raising ``TypeError``.

    :param sequence: One-letter amino-acid sequence, or a missing value.
    :param max_len: Length of the returned vector.
    :return: 1-D NumPy array of length ``max_len`` (default float dtype).
    """
    encoded = np.zeros((max_len,))

    # A missing sequence cannot be sliced; treat it like an empty string.
    if sequence is None or (isinstance(sequence, float) and np.isnan(sequence)):
        return encoded

    codes = [protein_dict.get(aa, 0) for aa in sequence[:max_len]]
    encoded[:len(codes)] = codes
    return encoded

In [5]:
def generate_report(df, output_path="report.docx"):
    """
    Build a .docx report about the predicted ligands and save it.

    The report contains summary statistics, histograms of predicted affinity,
    LogP and molecular weight, a table of the five highest-affinity molecules
    and 2D depictions of the three best ones.

    :param df: DataFrame with the columns ``Sequence``, ``Predicted_Label``,
        ``Is_Valid``, ``Predicted_Affinity``, ``LogP`` and
        ``Molecular_Weight``.
    :param output_path: Where to save the report.
    """
    import os
    import tempfile
    import warnings
    from rdkit import Chem
    from rdkit.Chem import Draw

    # Create the Word document
    doc = Document()
    doc.add_heading("Автоматический отчёт: Анализ предсказанных лигандов", level=1)

    # Summary statistics
    total = len(df)
    valid_smiles = df["Is_Valid"].sum()
    # An empty DataFrame must not trigger a division by zero.
    valid_share = valid_smiles / total * 100 if total else 0.0
    strong_binding = df[df["Predicted_Affinity"] >= 7.0]
    moderate_binding = df[(df["Predicted_Affinity"] >= 5.0) & (df["Predicted_Affinity"] < 7.0)]
    weak_binding = df[df["Predicted_Affinity"] < 5.0]

    summary_section = doc.add_paragraph()
    summary_section.add_run("📊 Сводная статистика\n").bold = True
    summary_section.add_run(f"Общее количество молекул: {total}\n")
    summary_section.add_run(f"Корректные SMILES: {valid_smiles} ({valid_share:.1f}%)\n")
    summary_section.add_run(f"Высокая аффинность (≥7.0): {len(strong_binding)}\n")
    summary_section.add_run(f"Умеренная аффинность (5.0–7.0): {len(moderate_binding)}\n")
    summary_section.add_run(f"Низкая аффинность (<5.0): {len(weak_binding)}\n")

    def add_centered_picture(image_path):
        """Append an image file to the document and centre its paragraph."""
        doc.add_picture(image_path)
        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

    def add_histogram(df_col, title, image_path, threshold=None):
        """Save a histogram of ``df_col`` to ``image_path`` and add it to the document."""
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.hist(df_col.dropna(), bins=20, color="skyblue", edgecolor="black")
        ax.set_title(title)
        ax.set_xlabel("Значение")
        ax.set_ylabel("Частота")
        # The threshold marker is only meaningful for the quantity it was
        # defined for, so it is drawn on request rather than on every plot.
        if threshold is not None:
            ax.axvline(threshold, color="red", linestyle="--", label=f"Порог: {threshold}")
            ax.legend()
        fig.savefig(image_path, dpi=100)
        plt.close(fig)
        add_centered_picture(image_path)

    # Intermediate PNG files live in a temporary directory so nothing is left
    # behind in the working directory. The paths stay plain strings.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Affinity histogram (with the strong-binding threshold)
        add_histogram(
            df["Predicted_Affinity"],
            "Распределение аффинности связывания",
            os.path.join(tmp_dir, "affinity_distribution.png"),
            threshold=7.0,
        )

        # LogP histogram
        add_histogram(
            df["LogP"],
            "Распределение липофильности (LogP)",
            os.path.join(tmp_dir, "logp_distribution.png"),
        )

        # Molecular weight histogram
        add_histogram(
            df["Molecular_Weight"],
            "Молекулярная масса",
            os.path.join(tmp_dir, "mol_weight_distribution.png"),
        )

        # TOP-5 molecules by predicted affinity
        top_molecules = df.sort_values(by="Predicted_Affinity", ascending=False).head(5)
        doc.add_heading("🏆 ТОП-5 молекул по аффинности", level=2)

        table = doc.add_table(rows=1, cols=5)
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = 'ID'
        hdr_cells[1].text = 'Белковая последовательность'
        hdr_cells[2].text = 'SMILES'
        hdr_cells[3].text = 'Аффинность'
        hdr_cells[4].text = 'Липофильность (LogP)'

        for idx, row in top_molecules.iterrows():
            # str() keeps missing (NaN) cells from crashing slicing and
            # the table-cell text assignment.
            sequence = str(row['Sequence'])
            row_cells = table.add_row().cells
            row_cells[0].text = str(idx)
            # Only mark the sequence as truncated when it really was.
            row_cells[1].text = sequence[:30] + ("..." if len(sequence) > 30 else "")
            row_cells[2].text = str(row['Predicted_Label'])
            row_cells[3].text = f"{row['Predicted_Affinity']:.2f}"
            row_cells[4].text = f"{row['LogP']:.2f}"

        doc.add_paragraph("\n")

        # Depictions of the three best molecules, matching the heading.
        doc.add_heading("🖼️ Визуализация топ-3 молекул", level=2)
        # Python warnings are silenced only while rendering, not for the
        # rest of the kernel session.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for rank, (_, row) in enumerate(top_molecules.head(3).iterrows(), start=1):
                smiles = row["Predicted_Label"]
                # A missing (non-string) label is reported as invalid.
                mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
                if mol is not None:
                    img_path = os.path.join(tmp_dir, f"mol_{rank}.png")
                    Draw.MolToFile(mol, img_path, size=(300, 300))
                    doc.add_paragraph(f"SMILES: {smiles}")
                    add_centered_picture(img_path)
                    doc.add_paragraph(f"Аффинность: {row['Predicted_Affinity']:.2f}, LogP: {row['LogP']:.2f}")
                else:
                    doc.add_paragraph(f"[Не валидный SMILES]: {smiles}")

        # Save the document
        doc.save(output_path)

    print(f"Отчёт сохранён как {output_path}")


In [6]:
# df = pd.read_csv(r"C:\Users\eliza\graduate_work_rep\BioMol-Generator\check_results\check-deepseek_coder_results.csv")
# # Кодирование
# X_drug = np.array([encode_smiles(s) for s in df["Predicted_Label"]])
# X_target = np.array([encode_protein(s) for s in df["Sequence"]])

# # Предсказание
# predicted_affinity = model.predict([X_drug, X_target])
# df["Predicted_Affinity"] = predicted_affinity

# model_name = 'deepseek_coder'
# generate_report(df, output_path=LIGAND_PREDICTION_REPORT_PATH / f"ligand_prediction_report_{model_name}.docx")

In [9]:
# Load the check results for the t5_v2 test run and display them.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that directory exists.
df = pd.read_csv(r"C:\Users\eliza\graduate_work_rep\BioMol-Generator\check_results\check-t5_v2_test.csv")
# Optional: drop rows with a missing prediction (currently disabled).
# df = df.dropna(subset='Predicted_Label')
df

Unnamed: 0,Sequence,True_Label,Predicted_Label,Is_Valid,Properties,LogP,Molecular_Weight,Tanimoto_Similarity,Is_Acceptable_Tanimoto,ECFP4_Similarity,ECFP6_Similarity,MCS_Ratio,Is_Acceptable_MCS
0,MFIEAIVLALTALILYSVYSVKSFNTTRPTDPPVYPVTVPFLGHIV...,O=C(NCC(c1ccccc1)n1ccnc1)c1ccc(-c2ccc(Cl)cc2)cc1,=C(Oc(C1ccc(c1)c1cccc2c1cccc-c2ccccCl)cccc11,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
1,MDSSTGPGNTSDCSDPLAQASCSPAPGSWLNLSHVDGNQSDPCGLN...,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)CC[C@H]...,CCC[CCNC@H12O1c(c(ccC)ccc[C@H](OC@@]](C)[[C@@]...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
2,MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...,C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4c(c5c6...,CcC@@12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4c(c5c6...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
3,MELENIVANSLLLKARQGGYGKKSGRSKKWKEILTLPPVSQCSELR...,CO[C@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1...,CcC@@H]1[C@H](N(C)C(=O)c2ccccc2)C[C@H]2O[C@]1(...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
4,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2ncnc(Oc3cccc(NC(=O)Nc4cc(-c5ccccc5)on4)...,Cc1cc2ncnc(Nc3ccc((NC(=O)Nc4cccCc5ccccccn4)c3)...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc(C(O)(c3cnnn3C)c3cnc(C)n3C)cc2c(C#N)...,Cc1cc2ccc(C(O)(c3cncn3C)cccnn(C)n3C)cc2c(ClN)c...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
996,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...,COCCNC(=O)[C@@H]1CCCN1C(=O)CC(c1ccccc1)c1ccccc1,Cc((=O)cC@@H](CCN2C(=O)[CCC2ccc((1)C1ccccc1,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
997,MEGTPAANWSVELDLGSGVPPGEEGNRTAGPPQRNEALARVEVAVL...,CCN1C(=O)CC2(CCCCC2)SSC[C@H](C(=O)N2CCC[C@H]2C...,c(CC(=O)N((CC(2)CC((C@@](Cc=O)N[CCCCCCC@H]2O(=...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False
998,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,COc1nc2ccc(C(O)(c3cnc(C)n3C)c3cnc(C)n3C)cc2c(C...,Cc1cc2ccc(C(O)(c3cnc(C)n3C)cccncnC)n3C)cc2c(Cl...,False,"{'LogP': None, 'Molecular Weight': None}",,,0.0,False,0.0,0.0,0.0,False


In [10]:
# Load the check results for the t5_v2 test run.
df = pd.read_csv(
    r"C:\Users\eliza\graduate_work_rep\BioMol-Generator\check_results\check-t5_v2_test.csv"
)

# Encode every predicted ligand and every protein sequence as a numeric vector.
X_drug = np.array(list(map(encode_smiles, df["Predicted_Label"])))
X_target = np.array(list(map(encode_protein, df["Sequence"])))

# Run the model on the (ligand, protein) pairs and keep the result in the frame.
predicted_affinity = model.predict([X_drug, X_target])
df["Predicted_Affinity"] = predicted_affinity

# Write the report for this generator model.
model_name = 't5_small_v2'
generate_report(
    df,
    output_path=LIGAND_PREDICTION_REPORT_PATH / f"ligand_prediction_report_{model_name}.docx",
)

Отчёт сохранён как c:\users\eliza\graduate_work_rep\biomol-generator\check_results\ligand_prediction_report_t5_small_v2.docx


In [None]:
# Load the check results for the t5_v2 test run.
df = pd.read_csv(
    r"C:\Users\eliza\graduate_work_rep\BioMol-Generator\check_results\check-t5_v2_test.csv"
)

# Encode every predicted ligand and every protein sequence as a numeric vector.
X_drug = np.array(list(map(encode_smiles, df["Predicted_Label"])))
X_target = np.array(list(map(encode_protein, df["Sequence"])))

# Run the model on the (ligand, protein) pairs and keep the result in the frame.
predicted_affinity = model.predict([X_drug, X_target])
df["Predicted_Affinity"] = predicted_affinity
