In [1]:
import pandas as pd

# Load the raw training table from the project's data directory.
train_csv_path = "./data/train.csv"
df = pd.read_csv(train_csv_path)

In [2]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10537 entries, 0 to 10536
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ChemID               10537 non-null  int64  
 1   source_casrn         10177 non-null  object 
 2   CAS                  10537 non-null  object 
 3   SMILES               10537 non-null  object 
 4   NAME                 10537 non-null  object 
 5   preferred_name       5808 non-null   object 
 6   iupac                5738 non-null   object 
 7   MPID                 10537 non-null  object 
 8   dsstox_substance_id  10182 non-null  object 
 9   Canonical_QSARr      10537 non-null  object 
 10  InChI_Code_QSARr     10537 non-null  object 
 11  InChI Key_QSARr      10537 non-null  object 
 12  Salt_Solvent         10530 non-null  object 
 13  Salt_Solvent_ID      56 non-null     float64
 14  Kow Reference        10530 non-null  object 
 15  LogP                 10537 non-null 

In [13]:
# Preview the first five rows.
df.head()

Unnamed: 0,ChemID,CAS,SMILES,NAME,MPID,Canonical_QSARr,InChI_Code_QSARr,InChI Key_QSARr,LogP
0,100001,50-00-0,[H]C([H])=O,FORMALDEHYDE,4675,C=O,InChI=1S/CH2O/c1-2/h1H2,WSFSSNUMVMOOMR-UHFFFAOYSA-N,0.35
1,100002,50-02-2|378-44-9,[H]OC([H])([H])C(=O)C1(O[H])C([H])(C([H])([H])...,"DEXAMETHASONE, BETAMETHASONE",4676|6480,CC1CC2C3CCC4=CC(=O)C=CC4(C)C3(F)C(O)CC2(C)C1(O...,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,UREBDLICKHMUKA-UHFFFAOYSA-N,1.83
2,100003,50-03-3,[H]OC1([H])C([H])([H])C2(C([H])([H])[H])C(O[H]...,HYDROCORTISONE ACETATE,4677,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,InChI=1S/C23H32O6/c1-13(24)29-12-19(27)23(28)9...,ALEXXDVDDISNDU-UHFFFAOYSA-N,2.19
3,100005,50-06-6,[H]C1=C([H])C([H])=C(C2(C([H])([H])C([H])([H])...,PHENOBARBITAL,4679,CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,DDBREPKUVSBGFI-UHFFFAOYSA-N,1.47
4,100006,50-07-7,[H]N([H])C(=O)OC([H])([H])C1([H])C2=C(C(=O)C(C...,MITOMYCIN C,4680,CC1C(=O)C2=C(C(COC(N)=O)C3(OC)C4NC4CN32)C(=O)C=1N,InChI=1S/C15H18N4O5/c1-5-9(16)12(21)8-6(4-24-1...,NWIBSHFKIJFRCO-UHFFFAOYSA-N,-0.4


In [None]:
# Drop every column that contains at least one missing value (rows are kept).
df = df.dropna(axis="columns", how="any")

In [12]:
# Re-check the schema after the column-wise dropna: only fully populated columns remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10537 entries, 0 to 10536
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ChemID            10537 non-null  int64  
 1   CAS               10537 non-null  object 
 2   SMILES            10537 non-null  object 
 3   NAME              10537 non-null  object 
 4   MPID              10537 non-null  object 
 5   Canonical_QSARr   10537 non-null  object 
 6   InChI_Code_QSARr  10537 non-null  object 
 7   InChI Key_QSARr   10537 non-null  object 
 8   LogP              10537 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 741.0+ KB


In [123]:
import pubchempy as pcp

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



from mordred import Calculator, descriptors

from mordred.HydrogenBond import HBondDonor, HBondAcceptor
from mordred.RingCount import RingCount
from mordred.Weight import Weight
from mordred.SLogP import SLogP
from mordred.TopoPSA import TopoPSA
from mordred.Aromatic import AromaticAtomsCount
from mordred.RotatableBond import RotatableBondsCount
from mordred.KappaShapeIndex import KappaShapeIndex1, KappaShapeIndex2, KappaShapeIndex3
from mordred.BalabanJ import BalabanJ
from mordred.ZagrebIndex import ZagrebIndex
from mordred.BertzCT import BertzCT

In [125]:
# Core descriptors: H-bond donors/acceptors, ring count, molecular weight and
# topological polar surface area.
important_desc = [HBondDonor(), HBondAcceptor(), RingCount(), Weight(), TopoPSA()]

# Supplementary descriptors: SLogP, aromatic atom and rotatable bond counts,
# the three Kappa shape indices, Balaban J, Zagreb index and Bertz complexity.
extra_desc = [
    SLogP(),
    AromaticAtomsCount(),
    RotatableBondsCount(),
    KappaShapeIndex1(),
    KappaShapeIndex2(),
    KappaShapeIndex3(),
    BalabanJ(),
    ZagrebIndex(),
    BertzCT(),
]

# Core descriptors first, then the supplementary ones.
descriptor_list = [*important_desc, *extra_desc]

# Mordred calculator for the descriptors above (3D descriptors disabled via ignore_3D).
calc = Calculator(descriptor_list, ignore_3D=True)

# Accumulator for one descriptor dict per molecule; filled by the loop cell.
records = []

In [127]:
# Compute the Mordred descriptors for every row of df whose SMILES RDKit can parse
# and collect them into desc_df (one row per successfully parsed molecule).

# Start from an empty accumulator so that re-running this cell does not append a
# second copy of every molecule to a list left over from a previous run.
records = []

# The raw CSV calls the column "NAME"; the rename cell further down turns it into
# "Name". Resolve the label once so this cell runs on a fresh kernel (before the
# rename) as well as after it, instead of raising KeyError on row["Name"].
name_col = "Name" if "Name" in df.columns else "NAME"

for idx, row in df.iterrows():
    smiles = row["SMILES"]

    # MolFromSmiles returns None when RDKit cannot build a molecule from the
    # string; RDKit logs the reason itself (e.g. the "Explicit valence" messages).
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("there is no smiles matching")
        continue

    desc_values = calc(mol).asdict()
    # Carry the identifying columns along; the output key is always "Name",
    # which is what the later merge joins on.
    desc_values["Name"] = row[name_col]
    desc_values["SMILES"] = smiles
    records.append(desc_values)

    # idx is the index label of df, so this reports progress every 500 labels.
    if idx % 500 == 0:
        print(f"[INFO] Processed {idx} / {len(df)} molecules")

# NOTE(review): desc_df.info() reports Kier2 and Kier3 as object dtype, so some of
# their values are not plain numbers - presumably Mordred error objects for
# molecules where the index is undefined; verify and coerce before modelling.
desc_df = pd.DataFrame(records)

[INFO] Processed 0 / 10534 molecules
[INFO] Processed 500 / 10534 molecules
[INFO] Processed 1000 / 10534 molecules


[23:05:17] Explicit valence for atom # 7 N, 4, is greater than permitted


there is no smiles matching
[INFO] Processed 1500 / 10534 molecules
[INFO] Processed 2000 / 10534 molecules


[23:05:21] Explicit valence for atom # 7 N, 4, is greater than permitted


there is no smiles matching
[INFO] Processed 2500 / 10534 molecules
[INFO] Processed 3000 / 10534 molecules
[INFO] Processed 3500 / 10534 molecules


[23:05:27] Explicit valence for atom # 13 N, 4, is greater than permitted


there is no smiles matching
there is no smiles matching


[23:05:28] Explicit valence for atom # 6 N, 4, is greater than permitted


[INFO] Processed 4000 / 10534 molecules
[INFO] Processed 4500 / 10534 molecules


[23:05:32] Explicit valence for atom # 7 N, 4, is greater than permitted


there is no smiles matching
[INFO] Processed 5000 / 10534 molecules
[INFO] Processed 5500 / 10534 molecules
[INFO] Processed 6000 / 10534 molecules
[INFO] Processed 6500 / 10534 molecules
[INFO] Processed 7000 / 10534 molecules


[23:05:46] Explicit valence for atom # 9 N, 4, is greater than permitted
[23:05:46] Explicit valence for atom # 7 N, 4, is greater than permitted


there is no smiles matching
there is no smiles matching
[INFO] Processed 7500 / 10534 molecules
[INFO] Processed 8000 / 10534 molecules


[23:05:50] Explicit valence for atom # 5 N, 4, is greater than permitted
[23:05:51] Explicit valence for atom # 10 N, 4, is greater than permitted


there is no smiles matching
there is no smiles matching
[INFO] Processed 8500 / 10534 molecules


[23:05:52] Explicit valence for atom # 6 N, 4, is greater than permitted


there is no smiles matching
[INFO] Processed 9000 / 10534 molecules
[INFO] Processed 9500 / 10534 molecules
[INFO] Processed 10000 / 10534 molecules
[INFO] Processed 10500 / 10534 molecules


In [128]:
# Standardise the name column label to "Name", the key used by the descriptor
# table and by the later merge. Rebinding instead of inplace=True keeps the cell
# safe to re-run (a missing "NAME" label is simply ignored by rename).
df = df.rename(columns={'NAME': 'Name'})

# info() prints its report itself and returns None, so it must not be wrapped in
# print(): doing so emitted a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10534 entries, 0 to 10533
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ChemID            10534 non-null  int64  
 1   CAS               10534 non-null  object 
 2   SMILES            10534 non-null  object 
 3   Name              10534 non-null  object 
 4   MPID              10534 non-null  object 
 5   Canonical_QSARr   10534 non-null  object 
 6   InChI_Code_QSARr  10534 non-null  object 
 7   InChI Key_QSARr   10534 non-null  object 
 8   LogP              10534 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 740.8+ KB
None


In [129]:
# Flag every SMILES that repeats an earlier row, report how many there are,
# then keep only the first occurrence of each SMILES and renumber the rows.
repeated_smiles = df["SMILES"].duplicated()
dups = repeated_smiles.sum()
print(f"중복된 SMILES 개수: {dups}")
df = df.loc[~repeated_smiles].reset_index(drop=True)

중복된 SMILES 개수: 0


In [130]:
# Print column names, non-null counts and dtypes of the descriptor table.
desc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10524 entries, 0 to 10523
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nHBDon       10524 non-null  int64  
 1   nHBAcc       10524 non-null  int64  
 2   nRing        10524 non-null  int64  
 3   MW           10524 non-null  float64
 4   TopoPSA(NO)  10524 non-null  float64
 5   SLogP        10524 non-null  float64
 6   nAromAtom    10524 non-null  int64  
 7   nRot         10524 non-null  int64  
 8   Kier1        10524 non-null  float64
 9   Kier2        10524 non-null  object 
 10  Kier3        10524 non-null  object 
 11  BalabanJ     10524 non-null  float64
 12  Zagreb1      10524 non-null  float64
 13  BertzCT      10524 non-null  float64
 14  Name         10524 non-null  object 
 15  SMILES       10524 non-null  object 
dtypes: float64(7), int64(5), object(4)
memory usage: 1.3+ MB


In [141]:
# Sanity-check the row counts of the source frame and the descriptor table.
n_source_rows = len(df)
print(f"원본 df 개수: {n_source_rows}")
n_valid_smiles = df['SMILES'].notna().sum()
print(f"유효 SMILES 개수: {n_valid_smiles}")

print(f"desc_df shape: {desc_df.shape}")
print(f"records 개수: {len(records)}")

# Count repeated names in the descriptor table. The check is on "Name",
# not on a canonical SMILES column.
dups = desc_df["Name"].duplicated().sum()
print(f"중복된 Name 개수: {dups}")

# Remove exact (Name, SMILES) repeats from both tables and renumber the rows.
# Rows that share a Name but differ in SMILES are kept.
dedup_keys = ["Name", 'SMILES']
desc_df = desc_df.drop_duplicates(subset=dedup_keys).reset_index(drop=True)
df = df.drop_duplicates(subset=dedup_keys).reset_index(drop=True)



원본 df 개수: 10534
유효 SMILES 개수: 10534
desc_df shape: (10297, 16)
records 개수: 10524
중복된 Name 개수: 0


In [142]:
# Schema and row count of the descriptor table after the (Name, SMILES) de-duplication.
desc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10297 entries, 0 to 10296
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nHBDon       10297 non-null  int64  
 1   nHBAcc       10297 non-null  int64  
 2   nRing        10297 non-null  int64  
 3   MW           10297 non-null  float64
 4   TopoPSA(NO)  10297 non-null  float64
 5   SLogP        10297 non-null  float64
 6   nAromAtom    10297 non-null  int64  
 7   nRot         10297 non-null  int64  
 8   Kier1        10297 non-null  float64
 9   Kier2        10297 non-null  object 
 10  Kier3        10297 non-null  object 
 11  BalabanJ     10297 non-null  float64
 12  Zagreb1      10297 non-null  float64
 13  BertzCT      10297 non-null  float64
 14  Name         10297 non-null  object 
 15  SMILES       10297 non-null  object 
dtypes: float64(7), int64(5), object(4)
memory usage: 1.3+ MB


In [143]:
# Schema and row count of df after the (Name, SMILES) de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10534 entries, 0 to 10533
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ChemID            10534 non-null  int64  
 1   CAS               10534 non-null  object 
 2   SMILES            10534 non-null  object 
 3   Name              10534 non-null  object 
 4   MPID              10534 non-null  object 
 5   Canonical_QSARr   10534 non-null  object 
 6   InChI_Code_QSARr  10534 non-null  object 
 7   InChI Key_QSARr   10534 non-null  object 
 8   LogP              10534 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 740.8+ KB


In [146]:
# Join the descriptor table with the source table on the compound name.
# Both inputs carry a SMILES column, so pandas suffixes them: SMILES_x comes
# from desc_df (left) and SMILES_y from df (right).
# "Name" is not unique, so every descriptor row is paired with every df row that
# shares its name; the pairs whose SMILES disagree are filtered out further down.
# how="outer" also keeps rows that found no partner on either side.
df_final = desc_df.merge(df, on="Name", how="outer")

final_shape = df_final.shape
print(f"[DONE] Final shape: {final_shape}")

[DONE] Final shape: (10534, 24)


In [145]:
# Inspect the merged table's columns, non-null counts and dtypes.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10534 entries, 0 to 10533
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ChemID            10534 non-null  int64  
 1   CAS               10534 non-null  object 
 2   SMILES_x          10534 non-null  object 
 3   Name              10534 non-null  object 
 4   MPID              10534 non-null  object 
 5   Canonical_QSARr   10534 non-null  object 
 6   InChI_Code_QSARr  10534 non-null  object 
 7   InChI Key_QSARr   10534 non-null  object 
 8   LogP              10534 non-null  float64
 9   nHBDon            10524 non-null  float64
 10  nHBAcc            10524 non-null  float64
 11  nRing             10524 non-null  float64
 12  MW                10524 non-null  float64
 13  TopoPSA(NO)       10524 non-null  float64
 14  SLogP             10524 non-null  float64
 15  nAromAtom         10524 non-null  float64
 16  nRot              10524 non-null  float6

---

In [62]:
# Collect every merged row that has a missing value in at least one column.
row_has_nan = df_final.isna().any(axis="columns")
df_missing = df_final.loc[row_has_nan]

In [64]:
# Display the rows that contain at least one missing value.
df_missing

Unnamed: 0,ChemID,CAS,SMILES_x,Name,MPID,Canonical_QSARr,InChI_Code_QSARr,InChI Key_QSARr,LogP,nHBDon,nHBAcc,nRing,MW,TopoPSA(NO),SMILES_y
1747,112271,NOCAS_879247,[H]C1C(N(C([H])([H])C([H])([H])[H])C([H])([H])...,"2,4-DiNH2-6-diEtAm-pyrimidine-3-oxide",15884,CCN(CC)C1CC(N)N(=O)C(N)N1,InChI=1S/C8H20N5O/c1-3-12(4-2)7-5-6(9)13(14)8(...,JWBVUUNANBAIIC-UHFFFAOYSA-N,1.16,,,,,,
3503,105488,14073-00-8,[H]C1C([H])C([H])C2C(C1[H])C([N+](=O)[O-])C(C(...,3-METHYL-4-NITROQUINOLINE-1-OXIDE,9868,CC1CN(=O)C2CCCCC2C1[N+]([O-])=O,InChI=1S/C10H17N2O3/c1-7-6-11(13)9-5-3-2-4-8(9...,DGBIVXRSPRWDSZ-UHFFFAOYSA-N,1.06,,,,,,
4438,105582,14906-59-3,[H]C1C(C#N)C([H])C([H])N(=O)C1[H],4-CYANOPYRIDINE OXIDE,9944,N#CC1CCN(=O)CC1,"InChI=1S/C6H9N2O/c7-5-6-1-3-8(9)4-2-6/h6H,1-4H2",MRSXJNPHXOSDAJ-UHFFFAOYSA-N,-0.94,,,,,,
5197,112230,NOCAS_877618,[H]C1NC2NC([H])N(=O)C(SC([H])([H])C([H])([H])C...,"7H-Purine, 6-[[5-[(2-ethoxy-2-oxoethyl)amino]pen",15849,CCOC(=O)CNCCCCCSC1C2NCNC2NCN1=O,InChI=1S/C14H28N5O3S/c1-2-22-11(20)8-15-6-4-3-...,DTFPRFJXWQUHSU-UHFFFAOYSA-N,0.87,,,,,,
6343,101997,480-96-6,[H]C1C([H])C([H])C2C(NON2=O)C1[H],"BENZOFURAZAN, 1-OXIDE",6609,O=N1ONC2CCCCC12,InChI=1S/C6H11N2O2/c9-8-6-4-2-1-3-5(6)7-10-8/h...,AJFSNCPFXDYNDP-UHFFFAOYSA-N,1.43,,,,,,
6570,113315,6141-98-6,[H]C1C([H])C([H])C2C(NN(=O)C3C([H])C([H])C([H]...,BNEZOCCINNOLINENOXIDE,16720,O=N1NC2CCCCC2C2CCCCC12,InChI=1S/C12H21N2O/c15-14-12-8-4-2-6-10(12)9-5...,IUTMFPAQLDGHQY-UHFFFAOYSA-N,2.24,,,,,,
7930,106711,29945-54-8,[H]C1=C([H])C([H])=C(C2NON(=O)C2N([H])[H])C([H...,"FURAZANAMINE, 4-PHENYL-, 2-OXIDE",10972,NC1C(NON1=O)c1ccccc1,InChI=1S/C8H10N3O2/c9-8-7(10-13-11(8)12)6-4-2-...,RYUYTHBIMVPHMG-UHFFFAOYSA-N,1.42,,,,,,
7935,110477,104151-78-2,[H]C([H])([H])OC(=O)C1C(C([H])([H])[H])NON1=O,"FURAZANCARBOXYLIC ACID, 4-METHYL-, METHYL ESTER,",14279,CC1NON(=O)C1C(=O)OC,InChI=1S/C5H9N2O4/c1-3-4(5(8)10-2)7(9)11-6-3/h...,YJSROLATKFZNLP-UHFFFAOYSA-N,0.69,,,,,,
7936,110478,104151-90-8,[H]C([H])([H])OC(=O)C1NON(=O)C1C([H])([H])[H],"FURAZANCARBOXYLIC ACID, 4-METHYL-, METHYL ESTER,",14280,CC1C(NON1=O)C(=O)OC,InChI=1S/C5H9N2O4/c1-3-4(5(8)10-2)6-11-7(3)9/h...,FXWYXKMXQZUWMX-UHFFFAOYSA-N,0.56,,,,,,
10764,103226,1131-61-9,[H]C1=C([H])C([H])=C(C2C([H])C([H])N(=O)C([H])...,PYRIDINE-1-OXIDE-4-PHENYL,7784,O=N1CCC(CC1)c1ccccc1,InChI=1S/C11H14NO/c13-12-8-6-11(7-9-12)10-4-2-...,QFHGVXAIVBLZHD-UHFFFAOYSA-N,0.93,,,,,,


In [147]:
# Rows whose two SMILES columns disagree. NaN never compares equal, so rows the
# outer merge left without a partner are included here as well.
df_weird = df_final[df_final['SMILES_x'] != df_final['SMILES_y']]
len(df_weird)
# Why do the SMILES differ? The merge key "Name" is not unique: several compounds
# share one name (e.g. TIOPINAC appears with two different ChemIDs), and a merge
# on "Name" alone pairs each of them with every other row of that name, which
# produces these crossed SMILES_x / SMILES_y combinations.

237

In [84]:
# Display the rows whose SMILES_x and SMILES_y differ.
df_weird

Unnamed: 0,ChemID,CAS,SMILES_x,Name,MPID,Canonical_QSARr,InChI_Code_QSARr,InChI Key_QSARr,LogP,nHBDon,nHBAcc,nRing,MW,TopoPSA(NO),SMILES_y
140,113932,61036-87-1,[H]OC1([H])C([H])([H])C2=C([H])C(OC([H])([H])[...,"1,2,3-TriMeOPh fused-ring derivative",17266,CC(=O)NC1C(O)Cc2cc(OC)c(OC)c(OC)c2C2=CC=C(OC)C...,InChI=1S/C22H25NO7/c1-11(24)23-20-14-10-15(25)...,DBTWJGPROSGGDL-UHFFFAOYSA-N,0.33,1.0,7.0,3.0,429.178752,92.32,[H]C1=C(OC([H])([H])[H])C(=O)C([H])=C2C(=C1[H]...
141,113933,61036-87-1,[H]C1=C(OC([H])([H])[H])C(=O)C([H])=C2C(=C1[H]...,"1,2,3-TriMeOPh fused-ring derivative",17267,CCOC(=O)NC1CCc2cc(OC)c(OC)c(OC)c2C2=CC=C(OC)C(...,InChI=1S/C23H27NO7/c1-6-31-23(26)24-16-9-7-13-...,NLHICQCHEWUHCR-UHFFFAOYSA-N,2.30,2.0,7.0,3.0,415.163102,103.32,[H]OC1([H])C([H])([H])C2=C([H])C(OC([H])([H])[...
157,111814,141605-16-5,[H]C1=C(Cl)C([H])=C(C2=NN=C(SC([H])([H])[H])N(...,"1,2,4-TRIAZIN-5(4H)-ONE, 4-AMINO-6-(3,5-DICHLORO",15470,CSC1=NN=C(C(=O)N1N)c1cc(Cl)cc(Cl)c1,InChI=1S/C10H8Cl2N4OS/c1-18-10-15-14-8(9(17)16...,ATCNTSZOBVZVGV-UHFFFAOYSA-N,3.02,2.0,6.0,2.0,285.018415,85.83,[H]C1=C(Cl)C([H])=C(C2=NN=C(N([H])C([H])([H])[...
158,111817,141627-87-4,[H]C1=C(Cl)C([H])=C(C2=NN=C(N([H])C([H])([H])[...,"1,2,4-TRIAZIN-5(4H)-ONE, 4-AMINO-6-(3,5-DICHLORO",15472,CNC1=NN=C(C(=O)N1N)c1cc(Cl)cc(Cl)c1,InChI=1S/C10H9Cl2N5O/c1-14-10-16-15-8(9(18)17(...,XASVMTBCGNDVBU-UHFFFAOYSA-N,1.91,1.0,6.0,2.0,301.979587,73.80,[H]C1=C(Cl)C([H])=C(C2=NN=C(SC([H])([H])[H])N(...
296,112660,103427-39-0,[H]N(C1=NC(N([H])C([H])(C([H])([H])[H])C([H])(...,"1,3,5-Triazine,2-difluoromethio-4-i-propylamino-",16218,CC(C)Nc1[n]c([n]c(NC)[n]1)SC(F)F,InChI=1S/C8H13F2N5S/c1-4(2)12-7-13-6(11-3)14-8...,RCTHRAFEROXVHV-UHFFFAOYSA-N,3.27,2.0,6.0,1.0,263.101623,62.73,[H]N(C1=NC(SC([H])(F)F)=NC(N([H])C([H])(C([H])...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11861,114462,SRC007-43-1,[H]OC1=C(/C(=N/OC([H])([H])C([H])([H])[H])C([H...,"TETRAHYDROPYRAN-2,4-DIONE,3[1-(ETHOXYIMINO)BUTYL",17733,CCON=C(CCC)C1=C(O)CC2(CCCC3CCCCC23)OC1=O,InChI=1S/C20H31NO4/c1-3-8-16(21-24-4-2)18-17(2...,SRUBVBHAENAFIV-LTGZKZEYSA-N,5.00,1.0,5.0,2.0,309.194008,68.12,[H]OC1=C(/C(=N/OC([H])([H])C([H])([H])[H])C([H...
11955,107936,61220-69-7,[H]OC(=O)C([H])([H])C1=C([H])C([H])=C2SC([H])(...,TIOPINAC,12067,OC(=O)Cc1cc2c(cc1)SCc1ccccc1C2=O,InChI=1S/C16H12O3S/c17-15(18)8-10-5-6-14-13(7-...,URRFMRCFYIOQKV-UHFFFAOYSA-N,2.97,1.0,3.0,3.0,284.050715,54.37,[H]OC(=O)C([H])([H])C1=C([H])C([H])=C2C(=O)C3=...
11956,108306,61220-69-7,[H]OC(=O)C([H])([H])C1=C([H])C([H])=C2C(=O)C3=...,TIOPINAC,12389,OC(=O)Cc1cc2SCc3ccccc3C(=O)c2cc1,InChI=1S/C16H12O3S/c17-15(18)8-10-5-6-13-14(7-...,KLIVRBFRQSOGQI-UHFFFAOYSA-N,2.97,1.0,3.0,3.0,284.050715,54.37,[H]OC(=O)C([H])([H])C1=C([H])C([H])=C2SC([H])(...
12263,110002,92897-88-6,[H]C1=C([H])C([H])=C(C2=NOC3([H])C([H])([H])OC...,"[1,3]DIOXEPINO[5,6-D]ISOXAZOLE, 3A,4,8,8A-TETRAH",13878,C1OC(OCC2ON=C(C21)c1ccccc1)c1ccccc1,InChI=1S/C18H17NO3/c1-3-7-13(8-4-1)17-15-11-20...,VIQUDOXMGPNZPV-UHFFFAOYSA-N,3.42,0.0,4.0,4.0,309.136493,40.05,[H]C1=C([H])C([H])=C(C2=NOC3([H])C([H])([H])OC...


In [148]:
# Pick one mismatched pair to inspect by hand.
# The row is selected by position rather than by a fixed index label, because the
# labels of df_final depend on how the merge orders its rows and a hard-coded
# label raises KeyError as soon as that order changes. Rows lacking either SMILES
# are skipped so both strings can be parsed by RDKit.
example_row = df_weird.dropna(subset=['SMILES_x', 'SMILES_y']).iloc[0]
smi1 = example_row['SMILES_x']
smi2 = example_row['SMILES_y']

In [149]:
from rdkit import Chem


# Parse both SMILES and compare their canonical forms: if the two strings were
# merely different spellings of one molecule, the canonical SMILES would match.
mol1, mol2 = (Chem.MolFromSmiles(smi) for smi in (smi1, smi2))

canonical_1 = Chem.MolToSmiles(mol1, canonical=True)
canonical_2 = Chem.MolToSmiles(mol2, canonical=True)

print(canonical_1)
print(canonical_2)
print(canonical_1 == canonical_2)


COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1C(NC(C)=O)C(O)C2
CCOC(=O)NC1CCc2cc(OC)c(OC)c(OC)c2-c2ccc(OC)c(=O)cc21
False


In [150]:
# Keep only the rows whose SMILES from both merge inputs agree. Rows where either
# side is NaN never compare equal, so they are removed together with the
# mismatched pairs. The original row labels are preserved.
smiles_agree = df_final['SMILES_x'] == df_final['SMILES_y']
df_final = df_final.loc[smiles_agree]

In [151]:
# Confirm the schema and row count after removing the mismatched rows.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10297 entries, 0 to 10533
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   nHBDon            10297 non-null  float64
 1   nHBAcc            10297 non-null  float64
 2   nRing             10297 non-null  float64
 3   MW                10297 non-null  float64
 4   TopoPSA(NO)       10297 non-null  float64
 5   SLogP             10297 non-null  float64
 6   nAromAtom         10297 non-null  float64
 7   nRot              10297 non-null  float64
 8   Kier1             10297 non-null  float64
 9   Kier2             10297 non-null  object 
 10  Kier3             10297 non-null  object 
 11  BalabanJ          10297 non-null  float64
 12  Zagreb1           10297 non-null  float64
 13  BertzCT           10297 non-null  float64
 14  Name              10297 non-null  object 
 15  SMILES_x          10297 non-null  object 
 16  ChemID            10297 non-null  int64  
 17

In [152]:
# Persist the merged feature table to the working directory.
# to_csv keeps its default index=True, so the row index (which has gaps after
# the filtering above) is written as the first, unnamed column.
# NOTE(review): the file shares its base name with the raw input
# (./data/train.csv) but is written to a different directory - confirm intended.
output_csv_path = "train.csv"
df_final.to_csv(output_csv_path)

---