输出数据的格式参考 `../data/seq-to-seq_datasets/uspto_50.pickle`,其列为 `['reactants_mol', 'products_mol', 'reaction_type', 'set']`。

In [29]:
import pandas as pd
from pathlib import Path
from rdkit.Chem import AllChem
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
import pickle
from collections import Counter

In [30]:
# Input CSV holding the raw reaction records, and the directory that
# receives the per-fold output files.
RAW = Path("data", "data.csv")
OUT_PATH = Path("data", "radicals")

In [31]:
# Load the raw reaction table, report its row count, and preview the first rows.
df_raw = pd.read_csv(filepath_or_buffer=RAW)
print(df_raw.shape[0])
df_raw.head(n=5)

17702


Unnamed: 0,id,mapped,confidence,rxn,reference,cls
0,0,O=N[C:2]([CH3:1])([C:3]([CH3:4])=[O:5])[C:6]([...,0.745358,CC(=O)C(C)(N=O)C(C)=O>>C[C](C(C)=O)C(C)=O,"Article; Li, Xin; Deng, Hui; Zhu, Xiao-Qing; W...",Others
1,1,[CH3:1][CH:2]([CH3:3])[C:4]([CH3:5])=[O:6]>>[C...,0.696944,CC(=O)C(C)C>>C[C](C)C(C)=O,"Article; Paul, Vikram; Roberts, Brian P.; Will...",Others
2,3,[CH3:1][C:2]([CH3:3])=[C:4]=[C:13]([CH3:14])[C...,0.174191,CC(C)=C=C(C)C.Clc1ccc(SSc2ccc(Cl)cc2)cc1>>C[C]...,"Article; Ito, Osamu; Journal of Organic Chemis...",Others
3,4,[CH3:1][C:2]([CH3:3])=[C:4]=[C:13]([CH3:14])[C...,0.19039,CC(C)=C=C(C)C.Cc1ccc(SSc2ccc(C)cc2)cc1>>C[C](C...,"Article; Ito, Osamu; Journal of Organic Chemis...",Others
4,5,Brc1ccc(S[S:5][c:6]2[cH:7][cH:8][c:9]([Br:10])...,0.217506,Brc1ccc(SSc2ccc(Br)cc2)cc1.CC(C)=C=C(C)C>>C[C]...,"Article; Ito, Osamu; Journal of Organic Chemis...",Others


In [32]:
# Parse every reaction SMILES ("reactants>>products") into RDKit molecules.
# Reactions whose product side holds more than one molecule, or whose SMILES
# RDKit cannot parse, are skipped. The three lists are filled together so
# they stay index-aligned.
reactants_mol = []
products_mol = []
rxn_types = []
# Iterate over the two needed columns directly: iterrows() builds a full
# Series for every row, which is slow and not needed here.
for rxn, rxn_cls in tqdm(zip(df_raw["rxn"], df_raw["cls"]), total=len(df_raw)):
    # Split once; element 0 is the reactant side, element 1 the product side.
    sides = rxn.split(">>")
    reacts, prods = sides[0], sides[1]

    # "." separates molecules in a SMILES string, so any "." on the product
    # side means there is more than one product.
    if "." in prods:
        print(f"The Num of Prods is not 1, skipped {prods}")
        continue

    # Keep the try block limited to the parsing calls: if an exception were
    # raised between the appends below, the lists would end up misaligned.
    try:
        mol_prods = AllChem.MolFromSmiles(prods)
        mol_reacts = AllChem.MolFromSmiles(reacts)
    except Exception as e:
        # Stay best-effort (skip the row), but say which reaction failed.
        print(f"Error while parsing {rxn}: {e}")
        continue

    # MolFromSmiles signals an unparsable SMILES by returning None.
    if (mol_prods is None) or (mol_reacts is None):
        print(f"Failed to get mol, skipped {prods}")
        continue

    reactants_mol.append(mol_reacts)
    products_mol.append(mol_prods)
    rxn_types.append(rxn_cls)

  6%|▌         | 998/17702 [00:00<00:01, 9978.08it/s]

The Num of Prods is not 1, skipped Cl.O=Cc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2)o1
The Num of Prods is not 1, skipped Cl.NCc1ccc(-c2ccccc2O)o1
The Num of Prods is not 1, skipped O=C(O)c1ccoc1-c1ccccc1[N+](=O)[O-].O=C(O)c1coc(-c2ccccc2[N+](=O)[O-])c1
The Num of Prods is not 1, skipped O=C(c1ccccc1)c1ccc(-c2ccccc2)nc1.O=C(c1ccccc1)c1cnccc1-c1ccccc1
The Num of Prods is not 1, skipped N#Cc1ccnc(-c2ccccc2F)c1.N#Cc1ccncc1-c1ccccc1F
The Num of Prods is not 1, skipped CC(=O)N[C@@H](CC(C)C)c1cc(C)c2ccccc2n1.CC(=O)N[C@H](CC(C)C)c1cc(C)c2ccccc2n1
The Num of Prods is not 1, skipped CC(=O)c1cc(C(=O)c2ccccc2)c(Cl)nn1.CC(=O)c1nnc(Cl)cc1C(=O)c1ccccc1
The Num of Prods is not 1, skipped COc1cc(CNC(=O)c2ccccc2)c(Cl)nn1.COc1nnc(Cl)cc1CNC(=O)c1ccccc1
The Num of Prods is not 1, skipped c1ccc(-c2cc(C3COCCO3)cc(C3COCCO3)n2)cc1.c1ccc(-c2cc(C3COCCO3)ccn2)cc1.c1ccc(-c2cccc(C3COCCO3)n2)cc1
The Num of Prods is not 1, skipped N#Cc1ccc(-c2ccccc2)nc1.N#Cc1cccnc1-c1ccccc1.N#Cc1cnccc1-c1ccccc1
The Num of P

 11%|█▏        | 1996/17702 [00:00<00:01, 8630.18it/s]

The Num of Prods is not 1, skipped CN1C(=O)CCC1c1nccc2ccccc12.O=C1CCCN1Cc1nccc2ccccc12
The Num of Prods is not 1, skipped CCOC(=O)c1ccnc(-c2ccc(F)cc2F)c1.CCOC(=O)c1ccncc1-c1ccc(F)cc1F
The Num of Prods is not 1, skipped CC1(c2nc(C3(C)CCCCC3)c3nc[nH]c3n2)CCCCC1.CC1(c2ncnc3[nH]cnc23)CCCCC1
The Num of Prods is not 1, skipped O=C(c1ccccc1)c1nnc(C(=O)c2ccccc2)c2ccccc12.O=C(c1ccccc1)c1nncc2ccccc12
The Num of Prods is not 1, skipped CC(C)Oc1cc(CNC(=O)c2ccccc2)c(Cl)nn1.CC(C)Oc1nnc(Cl)cc1CNC(=O)c1ccccc1
The Num of Prods is not 1, skipped Cc1cc(C(=O)N(C)C)c2ccccc2n1.Cc1cc(CN(C)C=O)c2ccccc2n1
The Num of Prods is not 1, skipped COc1ccc(C(F)(F)c2ccc3ccccc3n2)cc1.COc1ccc(C(F)(F)c2ccnc3ccccc23)cc1
The Num of Prods is not 1, skipped c1ccc(-c2ccc3ccccc3n2)cc1.c1ccc(-c2ccnc3ccccc23)cc1
The Num of Prods is not 1, skipped COc1nn2ccnc2c(C(=O)c2ccccc2)c1C(=O)c1ccccc1.COc1nn2ccnc2cc1C(=O)c1ccccc1
The Num of Prods is not 1, skipped N#Cc1ccnc(-c2ccccc2)c1.N#Cc1ccncc1-c1ccccc1
The Num of Prods is not 1, skipped 

 16%|█▌        | 2871/17702 [00:00<00:01, 8114.39it/s]

The Num of Prods is not 1, skipped Nc1cc(C2CCCCC2)c(Cl)nn1.Nc1nnc(Cl)cc1C1CCCCC1
The Num of Prods is not 1, skipped c1cc2c(C3CCCCC3)nc(C3CCCCC3)nc2[nH]1.c1cc2cnc(C3CCCCC3)nc2[nH]1
The Num of Prods is not 1, skipped I.[CH3].[Ca]
The Num of Prods is not 1, skipped CCOC(=O)/C=C1/CC(c2ccccc2)(C(F)(F)F)O1.CCOC(=O)/C=C1\CC(c2ccccc2)(C(F)(F)F)O1
The Num of Prods is not 1, skipped CC(C)(C)N1C[C@]2(C)CCCOc3ccccc3[C@H]12.[O-][Cl+3]([O-])([O-])O
The Num of Prods is not 1, skipped Cc1ccc2c(c1)OCCC[C@@]1(C)CN(C(C)(C)C)[C@@H]21.[O-][Cl+3]([O-])([O-])O
The Num of Prods is not 1, skipped C[Si](C)(C)CNCCCC1=CC(=O)CCC1.Cl


 40%|███▉      | 7059/17702 [00:00<00:01, 8150.69it/s]

The Num of Prods is not 1, skipped CC1=C(CCl)C2=c3oc(=O)oc3=C3C(C[n+]4ccccc4)=C(C)SC3(C)C2(C)S1.[Cl-]
The Num of Prods is not 1, skipped [Cl-].c1ccc(-c2n3ccccc3c3cccc[n+]23)cc1
The Num of Prods is not 1, skipped Cc1ccc2c(c1)C(C)(C)C1(C=Cc3c(ccc4c3ccc[n+]4C)O1)N2C.[I-]
The Num of Prods is not 1, skipped CC1=CC2[B-]3(c4c(C)cc(C)cc4C)c4cc(C(C)(C)C)ccc4-c4ccc(C(C)(C)C)cc4C23C=C1.[K+]
The Num of Prods is not 1, skipped CN[N+](C)(C)C(c1ccc(N2CCOCC2)cc1)c1ccc(N2CCOCC2)cc1.F[B-](F)(F)F
The Num of Prods is not 1, skipped C=CCN1c2ccccc2C(C)(C)C12C=Cc1c(ccc3c1ccc[n+]3C)O2.[I-]
The Num of Prods is not 1, skipped CCCN1c2ccccc2C(C)(C)C12C=Cc1c(ccc3c1ccc[n+]3C)O2.[I-]
The Num of Prods is not 1, skipped CC1=CC2(C)[B-]3(c4c(C)cc(C)cc4C)c4ccccc4-c4ccccc4C32C(C)=C1.[K+]
The Num of Prods is not 1, skipped CC1=CC2(C)[B-]3(c4c(C)cc(C)cc4C)c4cc(C(C)(C)C)ccc4-c4ccc(C(C)(C)C)cc4C32C(C)=C1.[K+]
The Num of Prods is not 1, skipped CC1=CC2[B-]3(c4c(C)cc(C)cc4C)c4ccccc4-c4ccccc4C23C=C1.[K+]
The Num of Prods is not 

 49%|████▉     | 8655/17702 [00:01<00:01, 7396.69it/s]

The Num of Prods is not 1, skipped C=C.COC(Cc1ccccc1)c1ccccc1
The Num of Prods is not 1, skipped [O-][Cl+3]([O-])([O-])[O-].c1ccc(C(CN2CCCCC2)[n+]2ccccc2)cc1


 73%|███████▎  | 12939/17702 [00:01<00:00, 6775.16it/s]

The Num of Prods is not 1, skipped Cc1ccc(C(CCCCC#N)c2ccc3ccccc3c2)cc1.O=C=O
The Num of Prods is not 1, skipped CC#N.CC(C)(C)O[K].Cl[Cu].O=c1oc2ccccc2cc1-c1ccc(C#Cc2ccccc2)cc1


 83%|████████▎ | 14625/17702 [00:01<00:00, 7617.64it/s]

The Num of Prods is not 1, skipped CN[N+](C)(C)C(c1ccc(N(C)CC(F)(F)F)cc1)c1ccc(N(C)CC(F)(F)F)cc1.F[B-](F)(F)F
The Num of Prods is not 1, skipped CN(CC(F)(F)F)c1ccc(C(c2ccc(N(C)CC(F)(F)F)cc2)[N+](C)(C)N)cc1.F[B-](F)(F)F
The Num of Prods is not 1, skipped C[Si](C)(C)C[CH+]CC(c1ccccc1)c1ccccc1.[Cl-]
The Num of Prods is not 1, skipped C[Si](C)(C)C[CH+]CC(c1ccccc1)c1ccccc1.[Br-]
The Num of Prods is not 1, skipped C[n+]1ccc(C=Cc2ccccc2)cc1.[I-]
The Num of Prods is not 1, skipped C[n+]1ccc(C=Cc2ccccc2)cc1.[I-]
The Num of Prods is not 1, skipped C[n+]1ccc(C=Cc2ccccc2)cc1.[I-]
The Num of Prods is not 1, skipped COS(=O)(=O)[O-].C[n+]1ccc(C=Cc2ccccc2)cc1
The Num of Prods is not 1, skipped COS(=O)(=O)[O-].C[n+]1ccc(C=Cc2ccccc2)cc1
The Num of Prods is not 1, skipped COS(=O)(=O)[O-].C[n+]1ccc(C=Cc2ccccc2)cc1
The Num of Prods is not 1, skipped CCCN1c2ccc(Br)cc2C(C)(C)C12C=Cc1c(ccc3c1ccc[n+]3C)O2.[I-]


 97%|█████████▋| 17091/17702 [00:02<00:00, 7285.00it/s]

The Num of Prods is not 1, skipped C=C[C@@H](O)CC[C@H](CCCCC[C@H](C)O[Si](c1ccccc1)(c1ccccc1)C(C)(C)C)OCOC.COCO[C@@H](CCCCC[C@H](C)O[Si](c1ccccc1)(c1ccccc1)C(C)(C)C)CC[C@@H](O)[C@@H]1CO1
The Num of Prods is not 1, skipped C=CC[C@@H]1[C@@]2(COC)CC[C@H](O[Si](C)(C)C(C)(C)C)[C@]1(COC(C)=O)CN(C(=O)OC(C)(C)C)C2.C=CC[C@H]1[C@@]2(COC)CC[C@H](O[Si](C)(C)C(C)(C)C)[C@]1(COC(C)=O)CN(C(=O)OC(C)(C)C)C2
The Num of Prods is not 1, skipped C=CC[C@@H](O)c1ccc(Cl)cc1.C=CC[C@H](O)c1ccc(Cl)cc1
The Num of Prods is not 1, skipped C=C(CC)C[C@@H](O)C[C@H]1COC(CC)(CC)O1.C=C(CC)C[C@H](O)C[C@H]1COC(CC)(CC)O1
The Num of Prods is not 1, skipped C=CC[C@@H](O)C[C@@H]1C[C@H](O[Si](C)(C)C(C)(C)C)C[C@H](/C=C/c2ccccc2)O1.C=CC[C@H](O)C[C@@H]1C[C@H](O[Si](C)(C)C(C)(C)C)C[C@H](/C=C/c2ccccc2)O1
The Num of Prods is not 1, skipped CC(=O)[CH]C(C)=O.[K+]
The Num of Prods is not 1, skipped CC(=O)[CH]C(C)=O.[K+]
The Num of Prods is not 1, skipped O=S(=O)([O-])c1ccccc1[C](c1ccccc1)c1ccccc1.[Li+]
The Num of Prods is not 1, skipped 

100%|██████████| 17702/17702 [00:02<00:00, 7472.79it/s]

The Num of Prods is not 1, skipped C=CCN(C12CC(C1)C2)S(=O)(=O)c1ccc(C)cc1.C=CCN(C12CC(CCC(=O)OC)(C1)C2)S(=O)(=O)c1ccc(C)cc1
The Num of Prods is not 1, skipped C=CCN(C12CC(C1)C2)S(=O)(=O)c1ccc(C)cc1.C=CCN(C12CC(CC(CCC(=O)OC)C(=O)OC)(C1)C2)S(=O)(=O)c1ccc(C)cc1.C=CCN(C12CC(CCC(=O)OC)(C1)C2)S(=O)(=O)c1ccc(C)cc1
The Num of Prods is not 1, skipped CCOc1ccc(-c2ccc(C(CC(=O)OC)C(=O)OC)cc2)cc1.CCOc1ccc(-c2ccccc2)cc1
The Num of Prods is not 1, skipped CCOc1ccc(-c2ccc(CCC(=O)OC)cc2)cc1.CCOc1ccc(-c2ccccc2)cc1
The Num of Prods is not 1, skipped CCOc1ccc(-c2ccc(CCC(N)=O)cc2)cc1.CCOc1ccc(-c2ccccc2)cc1
The Num of Prods is not 1, skipped CCOc1ccc(-c2ccc(CCC=O)cc2)cc1.CCOc1ccc(-c2ccccc2)cc1





In [33]:
# Sanity check: print the sizes of the three parallel lists on one line.
print(*map(len, (reactants_mol, products_mol, rxn_types)))

17593 17593 17593


In [34]:
# Collect the parsed molecules into a two-column frame, one row per kept reaction.
df = pd.DataFrame(dict(reactants_mol=reactants_mol, products_mol=products_mol))

In [35]:
# Shuffled, seeded stratified splitter. n_splits=5 is scikit-learn's default,
# spelled out here so the number of folds is visible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def to_csv(df, path, mode="train"):
    """Tag every row of ``df`` with a split label and pickle the result.

    Despite its name, this helper writes a pickle file (through
    ``DataFrame.to_pickle``), not a CSV.

    Args:
        df: DataFrame to save. It is not modified; the label is added to a
            copy.
        path: Destination of the pickle file.
        mode: Value stored in the ``"set"`` column of every row.
    """
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    labelled = df.assign(set=mode)
    # to_pickle() accepts no ``index`` keyword; passing one raises TypeError.
    labelled.to_pickle(path)
    print(f"write into {path}")


# Write one dataset per cross-validation fold. For each fold the held-out
# part is halved into validation and test rows, the split label is stored in
# the "set" column, the whole frame is pickled, and the test reactions are
# also exported as plain SMILES text files (one molecule string per line).

# Make sure both output locations exist so the cell also runs on a fresh checkout.
OUT_PATH.mkdir(parents=True, exist_ok=True)
idx_dir = Path("tmp")
idx_dir.mkdir(parents=True, exist_ok=True)

for i, (train_idx, test_idx) in enumerate(skf.split(df, rxn_types)):
    FOLD = OUT_PATH / f"radicals_fold{i}.pkl"
    extra_test_src = OUT_PATH / f"radicals_fold{i}_test_src.txt"
    extra_test_tgt = OUT_PATH / f"radicals_fold{i}_test_tgt.txt"

    # Split the held-out part once more, half validation and half test.
    val_idx, test_idx = train_test_split(test_idx, train_size=0.5, random_state=42)
    # Keep the test row positions of this fold for later reference.
    with open(idx_dir / f"idx_fold{i}.pkl", "wb") as f:
        pickle.dump(test_idx, f)
    # The splitter yields positional indices, while .loc is label-based;
    # this assumes df still carries its default 0..n-1 index.
    # Every row lands in exactly one of the three groups, so labels from the
    # previous fold are fully overwritten.
    df.loc[train_idx, "set"] = "train"
    df.loc[val_idx, "set"] = "valid"
    df.loc[test_idx, "set"] = "test"

    # Re-assigned on every pass with the same values; kept after the first
    # write to "set" so the column order of the saved frame is unchanged.
    df["reaction_type"] = rxn_types
    print(Counter(df["set"]))
    df.to_pickle(FOLD)

    df_test = df[df["set"] == "test"]
    print(f"write extra test csv file to {extra_test_src} and {extra_test_tgt}")
    # index=False / header=False: write only the SMILES strings.
    pd.DataFrame(
        {"smi": [AllChem.MolToSmiles(mol) for mol in df_test["products_mol"]]}
    ).to_csv(extra_test_tgt, index=False, header=False)
    pd.DataFrame(
        {"smi": [AllChem.MolToSmiles(mol) for mol in df_test["reactants_mol"]]}
    ).to_csv(extra_test_src, index=False, header=False)

Counter({'train': 14074, 'test': 1760, 'valid': 1759})
write extra test csv file to data/radicals/radicals_fold0_test_src.txt and data/radicals/radicals_fold0_test_tgt.txt
Counter({'train': 14074, 'test': 1760, 'valid': 1759})
write extra test csv file to data/radicals/radicals_fold1_test_src.txt and data/radicals/radicals_fold1_test_tgt.txt
Counter({'train': 14074, 'test': 1760, 'valid': 1759})
write extra test csv file to data/radicals/radicals_fold2_test_src.txt and data/radicals/radicals_fold2_test_tgt.txt
Counter({'train': 14075, 'test': 1759, 'valid': 1759})
write extra test csv file to data/radicals/radicals_fold3_test_src.txt and data/radicals/radicals_fold3_test_tgt.txt
Counter({'train': 14075, 'test': 1759, 'valid': 1759})
write extra test csv file to data/radicals/radicals_fold4_test_src.txt and data/radicals/radicals_fold4_test_tgt.txt
