In [1]:
from keras.models import load_model
import pandas as pd
import numpy as np

import os
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors

from rdkit.Chem import Draw
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

from pathlib import Path

from data.constants import FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH, BAPULM_RESULT

In [2]:
def _save_histogram(values, title, filename, threshold=None):
    """Save a 20-bin histogram of ``values`` (NaNs dropped) to ``filename`` as a PNG.

    :param values: pandas Series of numeric values to plot
    :param title: figure title
    :param filename: path of the PNG file to write
    :param threshold: optional x position of a dashed red reference line;
        when None, no line and no legend are drawn
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(values.dropna(), bins=20, color="skyblue", edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel("Значение")
    ax.set_ylabel("Частота")
    if threshold is not None:
        ax.axvline(threshold, color="red", linestyle="--", label=f"Порог: {threshold}")
        ax.legend()
    fig.savefig(filename, dpi=100)
    # Close explicitly so repeated report runs do not accumulate open figures.
    plt.close(fig)


def _add_centered_picture(doc, image_path):
    """Append the image at ``image_path`` to ``doc`` and centre its paragraph."""
    doc.add_picture(image_path)
    doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER


def generate_report(df, output_path="report.docx", *, strong_threshold=6.45, moderate_threshold=5.0):
    """
    Build a .docx report summarising predicted ligands and save it.

    The report contains summary statistics, three histograms (affinity, LogP
    restricted to [-2, 5], molecular weight), a table of the five rows with the
    highest predicted affinity, and a 2D drawing of each of those molecules.

    Side effects: PNG files (``affinity_distribution.png``,
    ``logp_distribution.png``, ``mol_weight_distribution.png`` and one
    ``mol_<index>.png`` per drawn molecule) are written to the current working
    directory, and a confirmation line is printed.

    :param df: DataFrame with the columns read here: ``Is_Valid``,
        ``predicted_neg_log10_affinity_M``, ``LogP``, ``Molecular_Weight``,
        ``Target`` and ``ligand_smiles``
    :param output_path: where to save the report; for a str / path-like value
        the parent directory is created if it does not exist
    :param strong_threshold: affinity at or above which binding counts as high;
        also drawn as the reference line on the affinity histogram
    :param moderate_threshold: lower bound of the moderate-affinity band
    """
    import warnings  # local import: `warnings` is not in the notebook's import cell

    doc = Document()
    doc.add_heading("Автоматический отчёт: Анализ предсказанных лигандов", level=1)

    # --- Summary statistics ---
    affinity = df["predicted_neg_log10_affinity_M"]
    total = len(df)
    valid_smiles = int(df["Is_Valid"].sum())
    # Guard against an empty frame: 0 / 0 would otherwise fail or yield NaN.
    valid_share = (valid_smiles / total * 100) if total else 0.0
    n_strong = int((affinity >= strong_threshold).sum())
    n_moderate = int(((affinity >= moderate_threshold) & (affinity < strong_threshold)).sum())
    n_weak = int((affinity < moderate_threshold).sum())

    summary_section = doc.add_paragraph()
    summary_section.add_run("📊 Сводная статистика\n").bold = True
    summary_section.add_run(f"Общее количество молекул: {total}\n")
    summary_section.add_run(f"Корректные SMILES: {valid_smiles} ({valid_share:.1f}%)\n")
    summary_section.add_run(f"Высокая аффинность (≥{strong_threshold}): {n_strong}\n")
    summary_section.add_run(f"Умеренная аффинность ({moderate_threshold}–{strong_threshold}): {n_moderate}\n")
    summary_section.add_run(f"Низкая аффинность (<{moderate_threshold}): {n_weak}\n")

    # --- Histograms ---
    # Only the affinity plot gets the threshold line; it is an affinity value
    # and is meaningless on the LogP and molecular-weight axes.
    _save_histogram(affinity, "Распределение аффинности связывания", "affinity_distribution.png",
                    threshold=strong_threshold)
    _add_centered_picture(doc, "affinity_distribution.png")

    logp_in_range = df.loc[(df["LogP"] >= -2) & (df["LogP"] <= 5), "LogP"]
    _save_histogram(logp_in_range, "Распределение липофильности (LogP) в диапазоне от -2 до 5",
                    "logp_distribution.png")
    _add_centered_picture(doc, "logp_distribution.png")

    _save_histogram(df["Molecular_Weight"], "Молекулярная масса", "mol_weight_distribution.png")
    _add_centered_picture(doc, "mol_weight_distribution.png")

    # --- Top molecules table ---
    top_molecules = df.sort_values(by="predicted_neg_log10_affinity_M", ascending=False).head(5)
    # Headings use the actual row count so they stay truthful for short frames.
    doc.add_heading(f"🏆 ТОП-{len(top_molecules)} молекул по аффинности", level=2)

    table = doc.add_table(rows=1, cols=5)
    header_titles = ('ID', 'Белковая последовательность', 'SMILES', 'Аффинность', 'Липофильность (LogP)')
    for cell, header_title in zip(table.rows[0].cells, header_titles):
        cell.text = header_title

    for idx, row in top_molecules.iterrows():
        target = row['Target']
        row_cells = table.add_row().cells
        row_cells[0].text = str(idx)
        # Add the ellipsis only when the sequence was actually cut.
        row_cells[1].text = target if len(target) <= 30 else target[:30] + "..."
        row_cells[2].text = row['ligand_smiles']
        row_cells[3].text = f"{row['predicted_neg_log10_affinity_M']:.2f}"
        row_cells[4].text = f"{row['LogP']:.2f}"

    doc.add_paragraph("\n")

    # --- Molecule drawings ---
    doc.add_heading(f"🖼️ Визуализация топ-{len(top_molecules)} молекул", level=2)
    # Suppress Python warnings only while drawing, instead of for the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for idx, row in top_molecules.iterrows():
            mol = Chem.MolFromSmiles(row["ligand_smiles"])
            if mol is None:
                doc.add_paragraph(f"[Не валидный SMILES]: {row['ligand_smiles']}")
                continue
            img_path = f"mol_{idx}.png"
            Draw.MolToFile(mol, img_path, size=(300, 300))
            doc.add_paragraph(f"SMILES: {row['ligand_smiles']}")
            _add_centered_picture(doc, img_path)
            doc.add_paragraph(f"Аффинность: {row['predicted_neg_log10_affinity_M']:.2f}, LogP: {row['LogP']:.2f}")

    # --- Save ---
    # Create the target directory for filesystem paths; streams are passed through as-is.
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    doc.save(output_path)
    print(f"Отчёт сохранён как {output_path}")

In [3]:
# Check whether a SMILES string can be parsed
def is_valid_smiles(smiles):
    """Return True when RDKit parses ``smiles`` into a molecule, else False.

    Non-string input (for example the NaN that pandas uses for a missing CSV
    cell, or None) is reported as invalid without being handed to RDKit.
    """
    if not isinstance(smiles, str):
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Compute physico-chemical descriptors for a SMILES string
def calculate_properties(smiles):
    """Return ``{'LogP': ..., 'Molecular_Weight': ...}`` for ``smiles``.

    Both values are None when the input is not a string (for example NaN from
    a missing CSV cell) or when RDKit cannot parse it. Otherwise they come from
    ``Descriptors.MolLogP`` and ``Descriptors.MolWt``.
    """
    # Non-string input is treated like an unparsable SMILES and never reaches RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {'LogP': None, 'Molecular_Weight': None}
    return {
        'LogP': Descriptors.MolLogP(mol),
        'Molecular_Weight': Descriptors.MolWt(mol),
    }

In [4]:
# Load the prediction CSV, then derive SMILES validity, descriptors and a
# space-free copy of the protein sequence.
predictions_file = BAPULM_RESULT / 'results_with_affinity_google_gemini-2.0-flash-001_predictions.csv'
df = pd.read_csv(predictions_file)

ligands = df['ligand_smiles']
df['Is_Valid'] = ligands.apply(is_valid_smiles)
df['Properties'] = ligands.apply(calculate_properties)

# Unpack the per-row descriptor dict into one column per descriptor.
for descriptor in ('LogP', 'Molecular_Weight'):
    df[descriptor] = df['Properties'].apply(lambda props, key=descriptor: props[key])

# The sequence is stored with a space between residues; drop the spaces.
df["Target"] = df["protein_seq"].apply(lambda seq: seq.replace(' ', ''))
df.head(3)

Unnamed: 0,protein_seq,ligand_smiles,predicted_neg_log10_affinity_M,Is_Valid,Properties,LogP,Molecular_Weight,Target
0,M S H H W G Y G K H N G P E H W H K D F P I A ...,Cc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1,6.555834,True,"{'LogP': 3.9007200000000024, 'Molecular_Weight...",3.90072,245.709,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
1,M A S P D W G Y D D K N G P E Q W S K L Y P I ...,Cc1ccc(C(=O)Nc2ccc(C(=O)O)cc2)cc1,5.183453,True,"{'LogP': 2.945520000000001, 'Molecular_Weight'...",2.94552,255.273,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...
2,M E P A P S A G A E L Q P P L F A N A S D A Y ...,CN1C=NC2=C1C(=O)N(C(=O)N2c1ccccc1)c1ccc(OCC(N)...,5.113006,True,"{'LogP': 0.7391999999999993, 'Molecular_Weight...",0.7392,391.387,MEPAPSAGAELQPPLFANASDAYPSACPSAGANASGPPGARSASSL...


In [5]:
def found_generated_smiles(sequence, df):
    """Look up ``Predicted_Affinity`` for the rows whose ``Sequence`` equals ``sequence``.

    The selection is passed through ``Series.squeeze()``, so exactly one match
    yields a scalar and any other number of matches yields a Series.

    NOTE(review): the name suggests SMILES are returned, but the column read is
    ``Predicted_Affinity`` - confirm which one is intended.
    """
    matches = df["Sequence"] == sequence
    return df.loc[matches, "Predicted_Affinity"].squeeze()

In [6]:

# Build the report location step by step, then generate the report.
model_name = 'google_gemini-2.0-flash-001'
# The ".csv" suffix is part of a directory name here: the report is saved inside it.
report_dir = FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH / f'check-results_with_affinity_{model_name}.csv'
report_file = report_dir / f"ligand_prediction_report_{model_name}.docx"
generate_report(df, output_path=report_file)

Отчёт сохранён как e:\graduate_work_rep\biomol-generator\data\check_predicted_smiles\check-results_with_affinity_google_gemini-2.0-flash-001.csv\ligand_prediction_report_google_gemini-2.0-flash-001.docx
