In [1]:
# 读取smiles
import pandas as pd

# Base name shared by the input CSV and by the output files written further down.
filename = 'r4n_smiles_c20'

# Read the cation table and keep a handle on its SMILES column.
input_path = f'data/{filename}_pubchem_validated.csv'
df = pd.read_csv(input_path)
smiles = df['SMILES']

# Preview the first few entries.
smiles.head()

0        C[N+](C)(C)C
1       CC[N+](C)(C)C
2    CC(C)[N+](C)(C)C
3      CCC[N+](C)(C)C
4      CC[N+](C)(C)CC
Name: SMILES, dtype: object

In [2]:
# Convert the R4N+ cation SMILES into bromide or chloride salts
from rdkit import Chem

def add_halide_to_smiles(smiles, halide='Br'):
    """
    Turn a quaternary-ammonium cation SMILES into a halide salt SMILES.

    Parameters:
    smiles: str - SMILES string of the R4N+ cation
    halide: str - 'Br' for the bromide salt, 'Cl' for the chloride salt

    Returns:
    str - the input SMILES with the halide anion appended as a
          dot-separated component

    Raises:
    ValueError - if ``halide`` is neither 'Br' nor 'Cl'
    """
    # Reject unsupported counter-ions up front.
    if halide not in ('Br', 'Cl'):
        raise ValueError("halide 必须是 'Br' 或 'Cl'")

    # The anion is appended as a separate, disconnected SMILES fragment.
    counter_ion = '.[Br-]' if halide == 'Br' else '.[Cl-]'
    return smiles + counter_ion

# Build one salt-SMILES column per halide from the cation SMILES.
for salt_column, halide_symbol in (('SMILES_Bromide', 'Br'), ('SMILES_Chloride', 'Cl')):
    df[salt_column] = smiles.apply(add_halide_to_smiles, halide=halide_symbol)

# Show the first five rows of the original column and of each salt column.
preview_sections = (
    ("原始 SMILES (前5条):", 'SMILES'),
    ("\n溴化盐 SMILES (前5条):", 'SMILES_Bromide'),
    ("\n氯化盐 SMILES (前5条):", 'SMILES_Chloride'),
)
for heading, column_name in preview_sections:
    print(heading)
    print(df[[column_name]].head())

# Sanity check: RDKit must be able to parse the first salt of each kind
# (MolFromSmiles returns None when parsing fails).
print("\n验证第一个分子的结构:")
mol_bromide = Chem.MolFromSmiles(df['SMILES_Bromide'].iat[0])
mol_chloride = Chem.MolFromSmiles(df['SMILES_Chloride'].iat[0])
print(f"溴化盐有效: {mol_bromide is not None}")
print(f"氯化盐有效: {mol_chloride is not None}")

原始 SMILES (前5条):
             SMILES
0      C[N+](C)(C)C
1     CC[N+](C)(C)C
2  CC(C)[N+](C)(C)C
3    CCC[N+](C)(C)C
4    CC[N+](C)(C)CC

溴化盐 SMILES (前5条):
           SMILES_Bromide
0      C[N+](C)(C)C.[Br-]
1     CC[N+](C)(C)C.[Br-]
2  CC(C)[N+](C)(C)C.[Br-]
3    CCC[N+](C)(C)C.[Br-]
4    CC[N+](C)(C)CC.[Br-]

氯化盐 SMILES (前5条):
          SMILES_Chloride
0      C[N+](C)(C)C.[Cl-]
1     CC[N+](C)(C)C.[Cl-]
2  CC(C)[N+](C)(C)C.[Cl-]
3    CCC[N+](C)(C)C.[Cl-]
4    CC[N+](C)(C)CC.[Cl-]

验证第一个分子的结构:
溴化盐有效: True
氯化盐有效: True


In [3]:
# Stack the bromide and chloride salt SMILES into one long-format table
# with a single SMILES_Salt column and a Halide_Type label.
base_columns = ['Index', 'Num_c', 'SMILES']

# Bromide rows: shared cation columns plus the salt SMILES and its label.
df_bromide = df[base_columns].assign(
    SMILES_Salt=df['SMILES_Bromide'],
    Halide_Type='Bromide',
)

# Chloride rows, built the same way.
df_chloride = df[base_columns].assign(
    SMILES_Salt=df['SMILES_Chloride'],
    Halide_Type='Chloride',
)

# Concatenate both halves, then order by Index and Halide_Type so the two
# salts of each cation sit next to each other.
df_combined = (
    pd.concat([df_bromide, df_chloride], ignore_index=True)
    .sort_values(by=['Index', 'Halide_Type'])
    .reset_index(drop=True)
)

# Report sizes and preview the result.
print(f"合并后的数据总行数: {len(df_combined)}")
print(f"原始数据行数: {len(df)}")
print(f"\n合并后的数据 (前10条):")
print(df_combined.head(10))

# Persist the combined table.
output_filename = f'data/{filename}_salts_combined.csv'
df_combined.to_csv(output_filename, index=False)
print(f"\n已保存到: {output_filename}")

合并后的数据总行数: 552
原始数据行数: 276

合并后的数据 (前10条):
   Index  Num_c            SMILES             SMILES_Salt Halide_Type
0      1      4      C[N+](C)(C)C      C[N+](C)(C)C.[Br-]     Bromide
1      1      4      C[N+](C)(C)C      C[N+](C)(C)C.[Cl-]    Chloride
2      2      5     CC[N+](C)(C)C     CC[N+](C)(C)C.[Br-]     Bromide
3      2      5     CC[N+](C)(C)C     CC[N+](C)(C)C.[Cl-]    Chloride
4      3      6  CC(C)[N+](C)(C)C  CC(C)[N+](C)(C)C.[Br-]     Bromide
5      3      6  CC(C)[N+](C)(C)C  CC(C)[N+](C)(C)C.[Cl-]    Chloride
6      4      6    CCC[N+](C)(C)C    CCC[N+](C)(C)C.[Br-]     Bromide
7      4      6    CCC[N+](C)(C)C    CCC[N+](C)(C)C.[Cl-]    Chloride
8      5      6    CC[N+](C)(C)CC    CC[N+](C)(C)CC.[Br-]     Bromide
9      5      6    CC[N+](C)(C)CC    CC[N+](C)(C)CC.[Cl-]    Chloride

已保存到: data/r4n_smiles_c20_salts_combined.csv


In [None]:
# 在pubchempy中验证这些化合物,要求返回能查询到CAS号的化合物smiles
import pubchempy as pcp
from typing import Optional

def _is_cas_number(candidate):
    """Return True when ``candidate`` is a well-formed CAS Registry Number.

    A CAS number consists of three hyphen-separated groups of ASCII digits:
    2 to 7 digits, exactly 2 digits, and a single check digit. The check
    digit must equal the weighted sum of the preceding digits (weights
    1, 2, 3, ... counted from the rightmost digit) modulo 10. Purely numeric
    look-alikes such as EC numbers ("200-581-3") or ISO dates ("2020-01-01")
    are therefore rejected.
    """
    if not isinstance(candidate, str):
        return False

    groups = candidate.split('-')
    if len(groups) != 3:
        return False

    head, middle, check = groups
    if not (2 <= len(head) <= 7 and len(middle) == 2 and len(check) == 1):
        return False

    # Explicit ASCII test: str.isdigit() would also accept Unicode digits.
    body = head + middle
    if any(ch not in '0123456789' for ch in body + check):
        return False

    weighted_sum = sum(
        weight * int(digit)
        for weight, digit in enumerate(reversed(body), start=1)
    )
    return weighted_sum % 10 == int(check)


def validate_and_get_cas(smiles, verbose: bool = False, idx: Optional[int] = None, total: Optional[int] = None):
    """Look a SMILES string up in PubChem and collect its CAS numbers.

    Parameters:
        smiles: str - SMILES string to query
        verbose: bool - print progress and the outcome of the query as it runs
        idx: int - position of this query, used only in the progress prefix (optional)
        total: int - total number of queries, used only in the progress prefix (optional)

    Returns:
        dict: always contains 'smiles', 'exists' and 'cas'.
            * CAS number found: 'exists' is True, 'cas' is a comma-separated
              string of every synonym that is a valid CAS number, and 'cid',
              'name' and 'molecular_weight' are taken from the first match.
            * No match, or a match without any CAS synonym: 'exists' is False
              and 'cas' is ''.
            * Query raised an exception: as above, plus 'error' holding the
              exception message.
    """
    # The progress prefix is shown only when both counters are supplied.
    prefix = f"[{idx}/{total}] " if (idx is not None and total is not None) else ""

    try:
        if verbose:
            print(f"{prefix}查询 PubChem: {smiles}", flush=True)

        results = pcp.get_compounds(smiles, 'smiles')

        if results:
            # Only the first returned compound is inspected.
            compound = results[0]
            synonyms = compound.synonyms or []

            # Keep only synonyms that are real CAS numbers (layout + check digit);
            # a plain "digits and hyphens" test also matches EC numbers and dates.
            cas_numbers = [s for s in synonyms if _is_cas_number(s)]

            if cas_numbers:
                cas_str = ', '.join(cas_numbers)
                if verbose:
                    print(f"{prefix}✓ 找到 CAS 号: {cas_numbers[0]}", flush=True)
                return {
                    'smiles': smiles,
                    'exists': True,
                    'cid': compound.cid,
                    'name': compound.iupac_name,
                    'molecular_weight': compound.molecular_weight,
                    'cas': cas_str
                }

        # No compound returned, or none of its synonyms is a CAS number.
        if verbose:
            print(f"{prefix}未找到 CAS 号", flush=True)
        return {'smiles': smiles, 'exists': False, 'cas': ''}

    except Exception as e:
        # Best effort: a failed lookup is recorded in the result instead of
        # aborting the whole batch.
        if verbose:
            print(f"{prefix}查询出错: {e}", flush=True)
        return {'smiles': smiles, 'exists': False, 'cas': '', 'error': str(e)}

# Collect the salt SMILES to query, together with their metadata columns.
smiles_list = df_combined['SMILES_Salt'].tolist()
indices = df_combined['Index'].tolist()
halide_types = df_combined['Halide_Type'].tolist()

print("开始验证化合物并查询 CAS 号...")
print(f"总共需要验证 {len(smiles_list)} 个化合物\n")

# Query every salt in order; idx/total plus verbose=True give live progress output.
n_salts = len(smiles_list)
results = [
    validate_and_get_cas(salt_smiles, verbose=True, idx=position, total=n_salts)
    for position, salt_smiles in enumerate(smiles_list, start=1)
]

# Attach the CAS strings to the combined table (results are in row order).
results_df = pd.DataFrame(results)
df_combined['CAS'] = results_df['cas'].tolist()

# Keep only the rows for which a CAS number was found (non-empty string).
has_cas = df_combined['CAS'].ne('')
df_with_cas = df_combined.loc[has_cas].copy()

print(f"\n{'='*60}")
print(f"验证完成!")
print(f"总化合物数: {len(df_combined)}")
print(f"有 CAS 号的化合物数: {len(df_with_cas)}")
print(f"{'='*60}\n")

# Preview the compounds that have a CAS number.
print(f"有 CAS 号的化合物 (前10条):")
print(df_with_cas.head(10))

# Save the CAS-bearing subset.
output_filename_validated = f'data/{filename}_salts_with_cas.csv'
df_with_cas.to_csv(output_filename_validated, index=False)
print(f"\n已保存有 CAS 号的化合物到: {output_filename_validated}")

# Also save the full table, including rows without a CAS number.
output_filename_all = f'data/{filename}_salts_all_with_cas_info.csv'
df_combined.to_csv(output_filename_all, index=False)
print(f"已保存完整数据到: {output_filename_all}")

开始验证化合物并查询 CAS 号...
总共需要验证 552 个化合物

[1/552] 查询 PubChem: C[N+](C)(C)C.[Br-]
[1/552] 未找到 CAS 号
[2/552] 查询 PubChem: C[N+](C)(C)C.[Cl-]
[1/552] 未找到 CAS 号
[2/552] 查询 PubChem: C[N+](C)(C)C.[Cl-]
[2/552] 未找到 CAS 号
[3/552] 查询 PubChem: CC[N+](C)(C)C.[Br-]
[2/552] 未找到 CAS 号
[3/552] 查询 PubChem: CC[N+](C)(C)C.[Br-]
[3/552] 未找到 CAS 号
[4/552] 查询 PubChem: CC[N+](C)(C)C.[Cl-]
[3/552] 未找到 CAS 号
[4/552] 查询 PubChem: CC[N+](C)(C)C.[Cl-]
[4/552] 未找到 CAS 号
[5/552] 查询 PubChem: CC(C)[N+](C)(C)C.[Br-]
[4/552] 未找到 CAS 号
[5/552] 查询 PubChem: CC(C)[N+](C)(C)C.[Br-]
[5/552] 未找到 CAS 号
[6/552] 查询 PubChem: CC(C)[N+](C)(C)C.[Cl-]
[5/552] 未找到 CAS 号
[6/552] 查询 PubChem: CC(C)[N+](C)(C)C.[Cl-]
[6/552] 未找到 CAS 号
[7/552] 查询 PubChem: CCC[N+](C)(C)C.[Br-]
[6/552] 未找到 CAS 号
[7/552] 查询 PubChem: CCC[N+](C)(C)C.[Br-]
[7/552] 未找到 CAS 号
[8/552] 查询 PubChem: CCC[N+](C)(C)C.[Cl-]
[7/552] 未找到 CAS 号
[8/552] 查询 PubChem: CCC[N+](C)(C)C.[Cl-]


KeyboardInterrupt: 