In [1]:
import pubchempy as pcp
from typing import Optional


def validate_against_pubchem(smiles, verbose: bool = False, idx: Optional[int] = None, total: Optional[int] = None):
    """验证生成的SMILES是否在PubChem中存在
    参数:
        smiles: str
        verbose: 是否实时打印查询进度与结果
        idx: 当前序号（可选）
        total: 总数（可选）
    返回:
        dict: 包含是否存在、CID、IUPAC名称和分子量等信息
    """
    prefix = f"[{idx}/{total}] " if (idx is not None and total is not None) else ""
    try:
        if verbose:
            print(f"{prefix}查询 PubChem: {smiles}", flush=True)
        results = pcp.get_compounds(smiles, 'smiles')
        if results:
            compound = results[0]
            info = {
                'smiles': smiles,
                'exists': True,
                'cid': compound.cid,
                'name': compound.iupac_name,
                'molecular_weight': compound.molecular_weight
            }
            if verbose:
                nm = info['name'] if info['name'] else '(无 IUPAC 名称)'
                print(f"{prefix}找到: CID={info['cid']}, 名称={nm}, 分子量={info['molecular_weight']}", flush=True)
            return info
    except Exception as e:
        if verbose:
            print(f"{prefix}查询出错: {e}", flush=True)
        return {'smiles': smiles, 'exists': False, 'error': str(e)}
    # 未找到
    if verbose:
        print(f"{prefix}未找到该分子", flush=True)
    return {'smiles': smiles, 'exists': False}

In [2]:
import pandas as pd

df = pd.read_csv('data/r4n_smiles_c16.csv')
lists = df['SMILES'].tolist()

# 实时输出：传入 idx 和 total，并开启 verbose
results = [
    validate_against_pubchem(smiles, verbose=True, idx=i, total=len(lists))
    for i, smiles in enumerate(lists, start=1)
]
results_df = pd.DataFrame(results)


[1/863] 查询 PubChem: C[N+](C)(C)C
[1/863] 找到: CID=6380, 名称=tetramethylazanium, 分子量=74.14
[2/863] 查询 PubChem: CC[N+](C)(C)C
[2/863] 找到: CID=33993, 名称=ethyl(trimethyl)azanium, 分子量=88.17
[3/863] 查询 PubChem: CC(C)[N+](C)(C)C
[3/863] 找到: CID=12127913, 名称=trimethyl(propan-2-yl)azanium, 分子量=102.2
[4/863] 查询 PubChem: CCC[N+](C)(C)C
[4/863] 找到: CID=29385, 名称=trimethyl(propyl)azanium, 分子量=102.2
[5/863] 查询 PubChem: CC[N+](C)(C)CC
[5/863] 找到: CID=199517, 名称=diethyl(dimethyl)azanium, 分子量=102.2
[6/863] 查询 PubChem: CC(C)(C)[N+](C)(C)C
[6/863] 找到: CID=14157076, 名称=tert-butyl(trimethyl)azanium, 分子量=116.22
[7/863] 查询 PubChem: CC(C)C[N+](C)(C)C
[7/863] 找到: CID=15919129, 名称=trimethyl(2-methylpropyl)azanium, 分子量=116.22
[8/863] 查询 PubChem: CCCC[N+](C)(C)C
[8/863] 找到: CID=24399, 名称=butyl(trimethyl)azanium, 分子量=116.22
[9/863] 查询 PubChem: CCC[N+](C)(C)CC
[9/863] 找到: CID=158452, 名称=ethyl-dimethyl-propylazanium, 分子量=116.22
[10/863] 查询 PubChem: CC[N+](C)(C)C(C)C
[10/863] 找到: CID=14022623, 名称=ethyl-dimethyl-propan-

In [3]:
results_df

Unnamed: 0,smiles,exists,cid,name,molecular_weight
0,C[N+](C)(C)C,True,6380.0,tetramethylazanium,74.14
1,CC[N+](C)(C)C,True,33993.0,ethyl(trimethyl)azanium,88.17
2,CC(C)[N+](C)(C)C,True,12127913.0,trimethyl(propan-2-yl)azanium,102.20
3,CCC[N+](C)(C)C,True,29385.0,trimethyl(propyl)azanium,102.20
4,CC[N+](C)(C)CC,True,199517.0,diethyl(dimethyl)azanium,102.20
...,...,...,...,...,...
858,CCC[N+](CCC)(CC(C)CC)CC(C)CC,True,,,
859,CCC[N+](CCC)(CCC(C)C)CC(C)CC,True,,,
860,CCC[N+](CCC)(CCC(C)C)CCC(C)C,True,127262902.0,bis(3-methylbutyl)-dipropylazanium,242.46
861,CC[N+](CCC(C)C)(CCC(C)C)C(C)(C)C,True,,,


In [6]:
# df和results_df直接拼接
final_df = pd.concat([df, results_df], axis=1)
final_df

Unnamed: 0,Index,Num_c,SMILES,smiles,exists,cid,name,molecular_weight
0,1,4,C[N+](C)(C)C,C[N+](C)(C)C,True,6380.0,tetramethylazanium,74.14
1,2,5,CC[N+](C)(C)C,CC[N+](C)(C)C,True,33993.0,ethyl(trimethyl)azanium,88.17
2,3,6,CC(C)[N+](C)(C)C,CC(C)[N+](C)(C)C,True,12127913.0,trimethyl(propan-2-yl)azanium,102.20
3,4,6,CCC[N+](C)(C)C,CCC[N+](C)(C)C,True,29385.0,trimethyl(propyl)azanium,102.20
4,5,6,CC[N+](C)(C)CC,CC[N+](C)(C)CC,True,199517.0,diethyl(dimethyl)azanium,102.20
...,...,...,...,...,...,...,...,...
858,859,16,CCC[N+](CCC)(CC(C)CC)CC(C)CC,CCC[N+](CCC)(CC(C)CC)CC(C)CC,True,,,
859,860,16,CCC[N+](CCC)(CCC(C)C)CC(C)CC,CCC[N+](CCC)(CCC(C)C)CC(C)CC,True,,,
860,861,16,CCC[N+](CCC)(CCC(C)C)CCC(C)C,CCC[N+](CCC)(CCC(C)C)CCC(C)C,True,127262902.0,bis(3-methylbutyl)-dipropylazanium,242.46
861,862,16,CC[N+](CCC(C)C)(CCC(C)C)C(C)(C)C,CC[N+](CCC(C)C)(CCC(C)C)C(C)(C)C,True,,,


In [8]:
# 保存结果
final_df = final_df.drop(columns=['smiles', 'exists'])

In [9]:
# 删去cid为NaN的行
final_df = final_df.dropna(subset=['cid'])
final_df

Unnamed: 0,Index,Num_c,SMILES,cid,name,molecular_weight
0,1,4,C[N+](C)(C)C,6380.0,tetramethylazanium,74.14
1,2,5,CC[N+](C)(C)C,33993.0,ethyl(trimethyl)azanium,88.17
2,3,6,CC(C)[N+](C)(C)C,12127913.0,trimethyl(propan-2-yl)azanium,102.20
3,4,6,CCC[N+](C)(C)C,29385.0,trimethyl(propyl)azanium,102.20
4,5,6,CC[N+](C)(C)CC,199517.0,diethyl(dimethyl)azanium,102.20
...,...,...,...,...,...,...
843,844,16,CCCC[N+](CCCC)(CC(C)C)CC(C)C,57477251.0,dibutyl-bis(2-methylpropyl)azanium,242.46
846,847,16,CCCC[N+](CCCC)(CCCC)C(C)(C)C,124089705.0,tributyl(tert-butyl)azanium,242.46
847,848,16,CCCC[N+](CCCC)(CCCC)CC(C)C,19858534.0,tributyl(2-methylpropyl)azanium,242.46
848,849,16,CCCC[N+](CCCC)(CCCC)CCCC,16028.0,tetrabutylazanium,242.46


In [10]:
# 保存结果
final_df.to_csv('data/r4n_smiles_c16_pubchem_validated.csv', index=False)