In [1]:
import pandas as pd
from molecule_evaluator import MoleculeEvaluator
from data.constants import API_OUTPUT_FILES_PATH, FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH, DATASET_PATH

  from rdkit.Chem import AllChem, MCS


In [2]:
# Locations of the per-model prediction CSVs inside the API output folder.
# The names are unpacked in the same order as the file names listed below.
# NOTE(review): `df_depseek_file_path` is misspelled ("depseek"), but other
# cells refer to it by this exact name, so the spelling is kept here.
df_gemini_file_path, df_mistral_file_path, df_depseek_file_path = (
    API_OUTPUT_FILES_PATH / csv_name
    for csv_name in (
        'results_with_affinity_google_gemini-2.0-flash-001.csv',
        'results_with_affinity_mistralai_mistral-large-2411.csv',
        'results_with_affinity_deepseek_deepseek-coder.csv',
    )
)

In [3]:
# Read the three per-model prediction tables, in the same order as the paths.
df_gemini, df_mistral, df_deepseek = (
    pd.read_csv(csv_path)
    for csv_path in (df_gemini_file_path, df_mistral_file_path, df_depseek_file_path)
)
# Preview the first rows of the gemini table as the cell output.
df_gemini.head()

Unnamed: 0,Sequence,Generated_SMILES,Predicted_Affinity,Attempts
0,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,Cc1ccc(C(=O)Nc2ccc(Cl)cc2)cc1,4.145286,0
1,MASPDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLK...,Cc1ccc(C(=O)Nc2ccc(C(=O)O)cc2)cc1,4.21253,0
2,MEPAPSAGAELQPPLFANASDAYPSACPSAGANASGPPGARSASSL...,CN1C=NC2=C1C(=O)N(C(=O)N2c1ccccc1)c1ccc(OCC(N)...,4.304656,0
3,MDSSTGPGNTSDCSDPLAQASCSPAPGSWLNLSHVDGNQSDPCGLN...,CN1C=NC2=C1C(=O)N(C(=O)N2c1ccccc1)Cc1ccccc1,4.324585,1
4,MDVVDSLLVNGSNITPPCELGLENETLFCLDQPRPSKEWQPAVQIL...,Cc1ccc(C(=O)Nc2cc(OC)c(OC)cc2)cc1,4.28652,0


In [4]:
def found_generated_smiles(sequence, df):
    """Look up the generated SMILES recorded for ``sequence`` in ``df``.

    Parameters
    ----------
    sequence
        Value compared for equality against the ``"Sequence"`` column.
    df : pandas.DataFrame
        Frame containing ``"Sequence"`` and ``"Generated_SMILES"`` columns.

    Returns
    -------
    object or None
        The ``"Generated_SMILES"`` value of the first row whose ``"Sequence"``
        equals ``sequence``; ``None`` when no row matches.

    Notes
    -----
    The result is always a scalar. Squeezing the selection instead would hand
    back a Series whenever zero or several rows match, which breaks callers
    that use this function inside ``Series.apply`` to build a column.
    """
    matches = df.loc[df["Sequence"] == sequence, "Generated_SMILES"]
    if matches.empty:
        # No prediction recorded for this sequence.
        return None
    # On duplicated sequences the first recorded prediction wins.
    return matches.iloc[0]

## метрика - значение
- Valid_Molecules - количество сгенерированных молекул с валидным (корректным) SMILES
- Total_Molecules - общее количество проверенных молекул
- Valid_Ratio - процент валидных химических молекул
- Acceptable_Molecules - количество молекул, прошедших порог Tanimoto Similarity ≥ 0.8
- Acceptable_Ratio - процент молекул, считающихся «допустимыми» по критерию Tanimoto Similarity

In [5]:
if __name__ == "__main__":
    # Evaluate the gemini predictions: attach the generated SMILES to the
    # reference dataset, write the joined table to CSV, and run the evaluator.
    raw_df = pd.read_parquet(DATASET_PATH)

    # Build a sequence -> SMILES lookup once instead of re-scanning the whole
    # predictions frame for every target. Duplicated sequences keep their first
    # prediction; targets without a prediction get NaN instead of a non-scalar.
    smiles_by_sequence = (
        df_gemini
        .dropna(subset=["Sequence"])
        .drop_duplicates(subset="Sequence")
        .set_index("Sequence")["Generated_SMILES"]
    )
    raw_df["Generated_SMILES"] = raw_df["Target"].map(smiles_by_sequence)

    # The evaluator is given file paths, so the joined table is written to disk first.
    raw_csv_path = DATASET_PATH.parent / f'raw_{df_gemini_file_path.name}'
    raw_df.to_csv(raw_csv_path, index=False)
    input_file = str(raw_csv_path)

    # NOTE(review): the output folder and the output file share the same
    # 'check-<csv name>' name; kept as-is so existing output locations do not move.
    folder_path = FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH / f'check-{df_gemini_file_path.name}'
    folder_path.mkdir(parents=True, exist_ok=True)
    output_file = str(folder_path / f'check-{df_gemini_file_path.name}')

    evaluator = MoleculeEvaluator(input_file, output_file)

    summary, results_df = evaluator.run()

    print("Summary Statistics:")
    for key, value in summary.items():
        # Floats are shown with two decimals; everything else is printed unchanged.
        print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

In [None]:
if __name__ == "__main__":
    # Evaluate the mistral predictions: attach the generated SMILES to the
    # reference dataset, write the joined table to CSV, and run the evaluator.
    raw_df = pd.read_parquet(DATASET_PATH)

    # Build a sequence -> SMILES lookup once instead of re-scanning the whole
    # predictions frame for every target. Duplicated sequences keep their first
    # prediction; targets without a prediction get NaN instead of a non-scalar.
    smiles_by_sequence = (
        df_mistral
        .dropna(subset=["Sequence"])
        .drop_duplicates(subset="Sequence")
        .set_index("Sequence")["Generated_SMILES"]
    )
    raw_df["Generated_SMILES"] = raw_df["Target"].map(smiles_by_sequence)

    # The evaluator is given file paths, so the joined table is written to disk first.
    raw_csv_path = DATASET_PATH.parent / f'raw_{df_mistral_file_path.name}'
    raw_df.to_csv(raw_csv_path, index=False)
    input_file = str(raw_csv_path)

    # NOTE(review): the output folder and the output file share the same
    # 'check-<csv name>' name; kept as-is so existing output locations do not move.
    folder_path = FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH / f'check-{df_mistral_file_path.name}'
    folder_path.mkdir(parents=True, exist_ok=True)
    output_file = str(folder_path / f'check-{df_mistral_file_path.name}')

    evaluator = MoleculeEvaluator(input_file, output_file)

    summary, results_df = evaluator.run()

    print("Summary Statistics:")
    for key, value in summary.items():
        # Floats are shown with two decimals; everything else is printed unchanged.
        print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

In [None]:
if __name__ == "__main__":
    # Evaluate the deepseek predictions: attach the generated SMILES to the
    # reference dataset, write the joined table to CSV, and run the evaluator.
    raw_df = pd.read_parquet(DATASET_PATH)

    # Build a sequence -> SMILES lookup once instead of re-scanning the whole
    # predictions frame for every target. Duplicated sequences keep their first
    # prediction; targets without a prediction get NaN instead of a non-scalar.
    smiles_by_sequence = (
        df_deepseek
        .dropna(subset=["Sequence"])
        .drop_duplicates(subset="Sequence")
        .set_index("Sequence")["Generated_SMILES"]
    )
    raw_df["Generated_SMILES"] = raw_df["Target"].map(smiles_by_sequence)

    # The evaluator is given file paths, so the joined table is written to disk first.
    # `df_depseek_file_path` (sic) is the name under which the path is defined elsewhere.
    raw_csv_path = DATASET_PATH.parent / f'raw_{df_depseek_file_path.name}'
    raw_df.to_csv(raw_csv_path, index=False)
    input_file = str(raw_csv_path)

    # NOTE(review): the output folder and the output file share the same
    # 'check-<csv name>' name; kept as-is so existing output locations do not move.
    folder_path = FIRST_ANALYSIS_SMILES_PREDICTIONS_PATH / f'check-{df_depseek_file_path.name}'
    folder_path.mkdir(parents=True, exist_ok=True)
    output_file = str(folder_path / f'check-{df_depseek_file_path.name}')

    evaluator = MoleculeEvaluator(input_file, output_file)

    summary, results_df = evaluator.run()

    print("Summary Statistics:")
    for key, value in summary.items():
        # Floats are shown with two decimals; everything else is printed unchanged.
        print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")