In [67]:
import pandas as pd
from pathlib import Path
from loguru import logger
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

import joblib

In [68]:
def load_and_merge_dye_data(data_dir: Path) -> pd.DataFrame:
    """Load the canonicalized dye CSV files and merge them into one table.

    The three dataset files are read from ``data_dir / "dyes"`` and
    concatenated. Rows without a ``stokes_shift`` value get one computed as
    ``emission - absorption`` when both of those values are present. Rows that
    still have a missing value in any selected column are dropped, and only
    the first row per SMILES string is kept.

    Args:
        data_dir: Directory that contains the ``dyes`` sub-directory.

    Returns:
        A DataFrame with the columns ``smiles``, ``absorption``, ``emission``,
        ``stokes_shift`` and ``quantum_yield`` and a fresh 0..n-1 index.
        An empty DataFrame is returned (after logging an error) when the
        directory or a file is missing, a file cannot be read, or a required
        column is absent from the merged data.
    """
    fluor_data_dir = data_dir / "dyes"

    # Make sure the data directory exists before touching any file.
    if not fluor_data_dir.exists() or not fluor_data_dir.is_dir():
        logger.error(f"目录 {fluor_data_dir} 不存在或不是有效的目录。")
        return pd.DataFrame()

    # Dataset files that make up the merged table.
    files = [
        "Dataset_Consolidation_canonicalized.csv",
        "Dataset_Cyanine_canonicalized.csv",
        "Dataset_Xanthene_canonicalized.csv",
    ]

    # Report every missing file at once instead of failing on the first one.
    missing_files = [f for f in files if not (fluor_data_dir / f).exists()]
    if missing_files:
        logger.error(f"以下文件缺失: {', '.join(missing_files)}")
        return pd.DataFrame()

    # Load each file; any read failure aborts the whole load.
    dfs = []
    for file in files:
        try:
            dfs.append(pd.read_csv(fluor_data_dir / file))
            logger.info(f"成功加载文件: {file}")
        except Exception as e:
            logger.error(f"加载文件 {file} 时出错: {e}")
            return pd.DataFrame()

    merged_df = pd.concat(dfs, ignore_index=True)

    # Columns that cannot be derived and therefore must be present.
    required_columns = ["smiles", "absorption", "emission", "quantum_yield"]
    missing_columns = [c for c in required_columns if c not in merged_df.columns]
    if missing_columns:
        logger.error(f"合并后的数据缺少必需的列: {', '.join(missing_columns)}")
        return pd.DataFrame()

    # stokes_shift is derivable, so a completely absent column is tolerated.
    if "stokes_shift" not in merged_df.columns:
        merged_df["stokes_shift"] = float("nan")

    # Missing values must be detected with notna(): NaN is truthy in a plain
    # ``if`` test (so it would count as "present"), while a real shift of 0
    # is falsy (so it would count as "missing").
    has_shift = merged_df["stokes_shift"].notna()
    computable = (
        ~has_shift
        & merged_df["emission"].notna()
        & merged_df["absorption"].notna()
    )
    if computable.any():
        merged_df.loc[computable, "stokes_shift"] = (
            merged_df.loc[computable, "emission"]
            - merged_df.loc[computable, "absorption"]
        )
    n_stokes_shift = int(has_shift.sum())
    n_emission_absorption = int(computable.sum())

    logger.info(f"Stokes shift 计算成功: {n_stokes_shift} 条数据，{n_emission_absorption} 条数据需要计算。")

    # Select the modelling columns, drop incomplete rows, keep the first row
    # per SMILES, and rebuild the index. Chaining returns new frames, which
    # avoids in-place mutation of a column slice.
    selected_columns = ["smiles", "absorption", "emission", "stokes_shift", "quantum_yield"]
    merged_df = (
        merged_df[selected_columns]
        .dropna()
        .drop_duplicates(subset=["smiles"], keep="first")
        .reset_index(drop=True)
    )

    logger.info(f"已加载 {len(merged_df)} 条唯一 SMILES 数据。")
    return merged_df

In [69]:
# Project data directory, given relative to the notebook's working directory.
data_dir = Path("..") / "data"
df = load_and_merge_dye_data(data_dir=data_dir)

[32m2025-04-22 12:37:42.513[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_and_merge_dye_data[0m:[36m29[0m - [1m成功加载文件: Dataset_Consolidation_canonicalized.csv[0m
[32m2025-04-22 12:37:42.515[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_and_merge_dye_data[0m:[36m29[0m - [1m成功加载文件: Dataset_Cyanine_canonicalized.csv[0m
[32m2025-04-22 12:37:42.517[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_and_merge_dye_data[0m:[36m29[0m - [1m成功加载文件: Dataset_Xanthene_canonicalized.csv[0m


[32m2025-04-22 12:37:43.019[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_and_merge_dye_data[0m:[36m48[0m - [1mStokes shift 计算成功: 39398 条数据，0 条数据需要计算。[0m
[32m2025-04-22 12:37:43.022[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_and_merge_dye_data[0m:[36m58[0m - [1m已加载 6703 条唯一 SMILES 数据。[0m


In [70]:
# Display summary statistics for the merged dye table returned by the loader.
df.describe()

Unnamed: 0,absorption,emission,stokes_shift,quantum_yield
count,6703.0,6703.0,6703.0,6703.0
mean,453.898255,530.709085,76.811428,0.350122
std,117.823913,107.912885,50.847197,0.288046
min,222.0,247.0,1.0,4e-05
25%,367.0,447.0,33.0,0.084
50%,423.0,521.0,70.0,0.29
75%,519.0,598.0,111.0,0.6
max,943.0,1097.0,325.0,1.0


In [76]:

def smiles_to_morgan_fingerprint(smiles: str, radius: int = 2, n_bits: int = 2048):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smiles: SMILES representation of the molecule.
        radius: Radius passed to RDKit's Morgan fingerprint routine.
        n_bits: Length of the returned bit vector.

    Returns:
        A 1-D integer NumPy array of length ``n_bits``. An all-zero array is
        returned when ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Non-string input (e.g. None or NaN from a DataFrame) is treated like an
    # unparseable SMILES instead of raising inside RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Same integer dtype as the valid path, so stacking fingerprints does
        # not silently upcast the feature matrix to float.
        return np.zeros(n_bits, dtype=int)
    # NOTE(review): recent RDKit releases appear to deprecate this call in
    # favour of rdFingerprintGenerator — confirm bit-for-bit equivalence
    # before migrating, since saved models depend on these features.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fingerprint, dtype=int)

def train_lightgbm_model(df: pd.DataFrame, test_size: float = 0.1, random_state: int = 42):
    """Train one LightGBM regressor per target on Morgan fingerprints.

    A ``MultiOutputRegressor`` wrapping ``LGBMRegressor`` is fitted to predict
    absorption, emission, stokes_shift and quantum_yield from fingerprints
    derived from the ``smiles`` column, then evaluated on a held-out split.

    Args:
        df: Table with a ``smiles`` column and the four target columns.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        A tuple ``(model, metrics)`` where ``model`` is the fitted
        ``MultiOutputRegressor`` and ``metrics`` maps each target name to a
        dict with its test-set ``"MSE"`` and ``"R2"``.

    Raises:
        KeyError: If ``df`` lacks ``smiles`` or any target column.
        ValueError: If ``df`` has no rows.
    """
    # Single source of truth for target names: the column order of ``y`` and
    # the column order of the predictions must stay aligned.
    targets = ["absorption", "emission", "stokes_shift", "quantum_yield"]

    # The loader returns an empty, column-less frame on failure; fail here
    # with an explicit message rather than an opaque lookup error.
    missing_columns = [col for col in ["smiles", *targets] if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input DataFrame is missing required columns: {missing_columns}")
    if df.empty:
        raise ValueError("Cannot train on an empty DataFrame.")

    # 1. Feature preparation
    logger.info("Converting SMILES to Morgan Fingerprints...")
    fingerprints = df["smiles"].apply(smiles_to_morgan_fingerprint)
    # One fingerprint bit per column; a DataFrame keeps feature names stable
    # between fit and predict.
    X = pd.DataFrame(fingerprints.tolist())
    y = df[targets]

    # 2. Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 3. Model definition and training
    logger.info("Training model for absorption, emission, stokes_shift, and quantum_yield...")
    lgb_model = lgb.LGBMRegressor(objective="regression")
    multioutput_model = MultiOutputRegressor(lgb_model)
    multioutput_model.fit(X_train, y_train)

    # 4. Prediction and evaluation
    logger.info("Making predictions on the test set...")
    y_pred = multioutput_model.predict(X_test)

    metrics = {}
    for idx, target in enumerate(targets):
        mse = mean_squared_error(y_test[target], y_pred[:, idx])
        r2 = r2_score(y_test[target], y_pred[:, idx])
        metrics[target] = {"MSE": mse, "R2": r2}
        logger.info(f"{target} - MSE: {mse:.4f}, R2: {r2:.4f}")

    return multioutput_model, metrics

In [79]:
# Keep the evaluation metrics under a real name: binding them to `_` throws
# them away and also shadows IPython's "last output" variable.
model, metrics = train_lightgbm_model(df)

[32m2025-04-22 12:40:45.088[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m13[0m - [1mConverting SMILES to Morgan Fingerprints...[0m
[32m2025-04-22 12:40:58.411[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m25[0m - [1mTraining model for absorption, emission, stokes_shift, and quantum_yield...[0m


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3242
[LightGBM] [Info] Number of data points in the train set: 6032, number of used features: 1621
[LightGBM] [Info] Start training from score 455.088031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3242
[LightGBM] [Info] Number of data points in the train set: 6032, number of used features: 1621
[LightGBM] [Info] Start training from score 531.524038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[32m2025-04-22 12:41:00.114[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m32[0m - [1mMaking predictions on the test set...[0m
[32m2025-04-22 12:41:00.127[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m41[0m - [1mabsorption - MSE: 1456.0986, R2: 0.8839[0m
[32m2025-04-22 12:41:00.128[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m41[0m - [1memission - MSE: 1847.3987, R2: 0.8274[0m
[32m2025-04-22 12:41:00.128[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m41[0m - [1mstokes_shift - MSE: 881.0986, R2: 0.6629[0m
[32m2025-04-22 12:41:00.129[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_lightgbm_model[0m:[36m41[0m - [1mquantum_yield - MSE: 0.0444, R2: 0.4833[0m


In [80]:
# Serialize the fitted multi-output model into the package resources folder.
model_path = "../dyeles/resources/lightgbm.pkl"
with open(model_path, mode="wb") as model_file:
    joblib.dump(model, model_file)