In [1]:
from keras.models import load_model
import pandas as pd
import numpy as np

import os
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors

from rdkit.Chem import Draw
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

from pathlib import Path

from data.constants import FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH, BAPULM_RESULT

In [2]:
def generate_report(df, output_path="report.docx"):
    """
    Generate a .docx report summarising predicted ligands.

    The report contains summary counts, three histograms (affinity, LogP,
    molecular weight), a table of the five rows with the highest predicted
    affinity, and 2D drawings of the three best of those rows.

    :param df: DataFrame with the columns read below: ``Is_Valid``,
        ``predicted_neg_log10_affinity_M``, ``LogP``, ``Molecular_Weight``,
        ``Target`` and ``ligand_smiles``.
    :param output_path: Where the document is saved. For a str / path-like
        value the parent directory is created if it is missing; any other
        object is handed to ``Document.save`` unchanged.
    :return: None. The document is written to ``output_path`` and a
        confirmation line is printed.
    """
    # Function-scope imports: neither module is part of the notebook's import
    # cell, and keeping them here makes the definition self-contained.
    import tempfile
    import warnings

    affinity_col = "predicted_neg_log10_affinity_M"
    # NOTE: the summary labels below spell these two cut-offs out as literal
    # text, so the labels must be edited together with the constants.
    strong_threshold = 6.45
    moderate_threshold = 5.0
    top_table_rows = 5
    # The drawing section's heading announces three molecules, so only the
    # first three rows of the top table are rendered.
    top_drawn_rows = 3

    doc = Document()
    doc.add_heading("–ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏–π –æ—Ç—á—ë—Ç: –ê–Ω–∞–ª–∏–∑ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã—Ö –ª–∏–≥–∞–Ω–¥–æ–≤", level=1)

    # --- Summary statistics -------------------------------------------------
    total = len(df)
    valid_smiles = df["Is_Valid"].sum()
    # Guard the percentage so an empty frame yields 0.0% instead of raising.
    valid_pct = valid_smiles / total * 100 if total else 0.0
    strong_binding = df[df[affinity_col] >= strong_threshold]
    moderate_binding = df[(df[affinity_col] >= moderate_threshold) & (df[affinity_col] < strong_threshold)]
    weak_binding = df[df[affinity_col] < moderate_threshold]

    summary_section = doc.add_paragraph()
    summary_section.add_run("üìä –°–≤–æ–¥–Ω–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞\n").bold = True
    summary_section.add_run(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –º–æ–ª–µ–∫—É–ª: {total}\n")
    summary_section.add_run(f"–ö–æ—Ä—Ä–µ–∫—Ç–Ω—ã–µ SMILES: {valid_smiles} ({valid_pct:.1f}%)\n")
    summary_section.add_run(f"–í—ã—Å–æ–∫–∞—è –∞—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç—å (‚â•6.45): {len(strong_binding)}\n")
    summary_section.add_run(f"–£–º–µ—Ä–µ–Ω–Ω–∞—è –∞—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç—å (5.0‚Äì6.45): {len(moderate_binding)}\n")
    summary_section.add_run(f"–ù–∏–∑–∫–∞—è –∞—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç—å (<5.0): {len(weak_binding)}\n")

    def plot_and_save(df_col, title, filename, threshold=None):
        """Save a 20-bin histogram of ``df_col``; mark ``threshold`` if given."""
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.hist(df_col.dropna(), bins=20, color="skyblue", edgecolor="black")
        ax.set_title(title)
        ax.set_xlabel("–ó–Ω–∞—á–µ–Ω–∏–µ")
        ax.set_ylabel("–ß–∞—Å—Ç–æ—Ç–∞")
        if threshold is not None:
            # The cut-off only has meaning on the affinity axis, so the line
            # and its legend are drawn solely when a threshold is passed.
            ax.axvline(threshold, color="red", linestyle="--", label=f"–ü–æ—Ä–æ–≥: {threshold}")
            ax.legend()
        fig.savefig(filename, dpi=100)
        plt.close(fig)

    def add_centered_picture(image_path):
        """Append an image to the document and centre its paragraph."""
        doc.add_picture(str(image_path))
        doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Intermediate PNGs live in a temporary directory so the working directory
    # is not littered; the directory is kept until the document is saved.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_dir = Path(tmp_dir)

        # --- Histograms -----------------------------------------------------
        affinity_png = image_dir / "affinity_distribution.png"
        plot_and_save(
            df[affinity_col],
            "–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∞—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç–∏ —Å–≤—è–∑—ã–≤–∞–Ω–∏—è",
            affinity_png,
            threshold=strong_threshold,
        )
        add_centered_picture(affinity_png)

        # Only LogP values inside [-2, 5] are plotted, as stated in the title.
        df_filtered_logp = df[(df["LogP"] >= -2) & (df["LogP"] <= 5)]
        logp_png = image_dir / "logp_distribution.png"
        plot_and_save(
            df_filtered_logp["LogP"],
            "–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ª–∏–ø–æ—Ñ–∏–ª—å–Ω–æ—Å—Ç–∏ (LogP) –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ –æ—Ç -2 –¥–æ 5",
            logp_png,
        )
        add_centered_picture(logp_png)

        mol_weight_png = image_dir / "mol_weight_distribution.png"
        plot_and_save(df["Molecular_Weight"], "–ú–æ–ª–µ–∫—É–ª—è—Ä–Ω–∞—è –º–∞—Å—Å–∞", mol_weight_png)
        add_centered_picture(mol_weight_png)

        # --- Table of the best rows by predicted affinity -------------------
        top_molecules = df.sort_values(by=affinity_col, ascending=False).head(top_table_rows)
        doc.add_heading("üèÜ –¢–û–ü-5 –º–æ–ª–µ–∫—É–ª –ø–æ –∞—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç–∏", level=2)

        table = doc.add_table(rows=1, cols=5)
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = 'ID'
        hdr_cells[1].text = '–ë–µ–ª–∫–æ–≤–∞—è –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ—Å—Ç—å'
        hdr_cells[2].text = 'SMILES'
        hdr_cells[3].text = '–ê—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç—å'
        hdr_cells[4].text = '–õ–∏–ø–æ—Ñ–∏–ª—å–Ω–æ—Å—Ç—å (LogP)'

        for idx, row in top_molecules.iterrows():
            row_cells = table.add_row().cells
            row_cells[0].text = str(idx)
            target = row['Target']
            # Append the ellipsis only when the sequence was actually cut.
            row_cells[1].text = (target[:30] + "...") if len(target) > 30 else target
            row_cells[2].text = str(row['ligand_smiles'])
            row_cells[3].text = f"{row[affinity_col]:.2f}"
            row_cells[4].text = f"{row['LogP']:.2f}"

        doc.add_paragraph("\n")

        # --- 2D drawings of the best molecules ------------------------------
        doc.add_heading("üñºÔ∏è –í–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è —Ç–æ–ø-3 –º–æ–ª–µ–∫—É–ª", level=2)
        # Warnings are silenced only while rendering, instead of installing a
        # process-wide "ignore" filter that would outlive this call.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for i, row in top_molecules.head(top_drawn_rows).iterrows():
                smiles = row["ligand_smiles"]
                # Non-string cells (e.g. NaN) cannot be parsed; treat as invalid.
                mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
                if mol is not None:
                    img_path = image_dir / f"mol_{i}.png"
                    Draw.MolToFile(mol, str(img_path), size=(300, 300))
                    doc.add_paragraph(f"SMILES: {row['ligand_smiles']}")
                    add_centered_picture(img_path)
                    doc.add_paragraph(f"–ê—Ñ—Ñ–∏–Ω–Ω–æ—Å—Ç—å: {row['predicted_neg_log10_affinity_M']:.2f}, LogP: {row['LogP']:.2f}")
                else:
                    doc.add_paragraph(f"[–ù–µ –≤–∞–ª–∏–¥–Ω—ã–π SMILES]: {row['ligand_smiles']}")

        # --- Save -----------------------------------------------------------
        if isinstance(output_path, (str, os.PathLike)):
            # Make sure the destination folder exists before saving.
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        doc.save(output_path)

    print(f"–û—Ç—á—ë—Ç —Å–æ—Ö—Ä–∞–Ω—ë–Ω –∫–∞–∫ {output_path}")

In [3]:
# Function for checking SMILES validity
def is_valid_smiles(smiles):
    """
    Tell whether ``smiles`` can be parsed into a molecule.

    :param smiles: Candidate SMILES value; may be a non-string such as
        ``None`` or a float NaN coming from a missing CSV cell.
    :return: ``False`` for non-string or blank input; otherwise ``True`` iff
        ``Chem.MolFromSmiles`` returns a molecule rather than ``None``.
    """
    # Reject missing / blank cells up front: the parser is only meant to see
    # real text, and an empty string must not be counted as a valid ligand.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Function for calculating physico-chemical properties
def calculate_properties(smiles):
    """
    Compute LogP and molecular weight for a SMILES string.

    :param smiles: Candidate SMILES value; may be a non-string such as
        ``None`` or a float NaN coming from a missing CSV cell.
    :return: Dict with keys ``'LogP'`` and ``'Molecular_Weight'``. Both values
        are ``None`` when the input is not a string, is blank, or cannot be
        parsed; otherwise they are the results of ``Descriptors.MolLogP`` and
        ``Descriptors.MolWt``.
    """
    # Missing / blank cells get the same "no properties" result as
    # unparseable SMILES instead of being passed to the parser.
    if not isinstance(smiles, str) or not smiles.strip():
        return {'LogP': None, 'Molecular_Weight': None}

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'LogP': None, 'Molecular_Weight': None}

    return {
        'LogP': Descriptors.MolLogP(mol),
        'Molecular_Weight': Descriptors.MolWt(mol),
    }

In [4]:
# Load the prediction table and derive the columns used by the report.
predictions_csv = BAPULM_RESULT / 'results_with_affinity_google_gemini-2.0-flash-001_predictions.csv'
df = pd.read_csv(predictions_csv)

# Per-ligand validity flag and property dict, both computed from the SMILES column.
smiles_series = df['ligand_smiles']
df['Is_Valid'] = smiles_series.apply(is_valid_smiles)
df['Properties'] = smiles_series.apply(calculate_properties)

# Unpack the property dicts into one flat column per property.
for property_name in ('LogP', 'Molecular_Weight'):
    df[property_name] = df['Properties'].apply(lambda props, key=property_name: props[key])

# The sequence is stored with a space between residues; strip the spaces.
df["Target"] = df["protein_seq"].apply(lambda seq: seq.replace(' ', ''))

df.head(3)

Unnamed: 0,protein_seq,ligand_smiles,predicted_neg_log10_affinity_M,Is_Valid,Properties,LogP,Molecular_Weight,Target
0,M S H H W G Y G K H N G P E H W H K D F P I A ...,Cc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1,6.555834,True,"{'LogP': 3.9007200000000024, 'Molecular_Weight...",3.90072,245.709,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
1,M A S P D W G Y D D K N G P E Q W S K L Y P I ...,Cc1ccc(C(=O)Nc2ccc(C(=O)O)cc2)cc1,5.183453,True,"{'LogP': 2.945520000000001, 'Molecular_Weight'...",2.94552,255.273,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...
2,M E P A P S A G A E L Q P P L F A N A S D A Y ...,CN1C=NC2=C1C(=O)N(C(=O)N2c1ccccc1)c1ccc(OCC(N)...,5.113006,True,"{'LogP': 0.7391999999999993, 'Molecular_Weight...",0.7392,391.387,MEPAPSAGAELQPPLFANASDAYPSACPSAGANASGPPGARSASSL...


In [5]:
def found_generated_smiles(sequence, df):
    """
    Look up the ``Predicted_Affinity`` value(s) of rows whose ``Sequence``
    equals ``sequence``.

    :param sequence: Value compared against the ``Sequence`` column.
    :param df: DataFrame that has ``Sequence`` and ``Predicted_Affinity`` columns.
    :return: The matching values after ``squeeze()`` - a scalar for exactly
        one match, otherwise a Series (empty when nothing matches).

    NOTE(review): despite the name, this returns the affinity column, and the
    frame loaded in this notebook uses ``protein_seq`` /
    ``predicted_neg_log10_affinity_M`` rather than these column names -
    confirm which schema this helper is meant for.
    """
    matches = df["Sequence"] == sequence
    return df.loc[matches, "Predicted_Affinity"].squeeze()

In [6]:

# Model whose predictions are reported; used in the report file name.
model_name = 'google_gemini-2.0-flash-001'

# Build the destination step by step: per-run folder first, then the file.
# NOTE(review): the folder name repeats the model name as a literal and ends
# in ".csv" - confirm that this is the intended directory.
report_dir = FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH / 'check-results_with_affinity_google_gemini-2.0-flash-001.csv'
report_file = report_dir / f"ligand_prediction_report_{model_name}.docx"

generate_report(df, output_path=report_file)

–û—Ç—á—ë—Ç —Å–æ—Ö—Ä–∞–Ω—ë–Ω –∫–∞–∫ e:\graduate_work_rep\biomol-generator\data\check_predicted_smiles\check-results_with_affinity_google_gemini-2.0-flash-001.csv\ligand_prediction_report_google_gemini-2.0-flash-001.docx
