In [4]:
import pandas as pd 
from rdkit import Chem
from pathlib import Path
from rdkit.Chem import rdFMCS

In [5]:
# Read the raw spreadsheet and remember its shape before any filtering,
# so later cells can report how many rows were removed.
path_raw_data = Path("raw_dataset.xlsx")
raw_df = pd.read_excel(path_raw_data)
# NOTE(review): `inital_shape` is misspelled ("initial") but is read by later
# cells under this exact name; rename it everywhere at once if you fix it.
inital_shape = raw_df.shape

In [6]:
# Keep only the rows whose 'donotuse' cell is empty (NaN); any row carrying
# a value in that column is discarded.
raw_df = raw_df.loc[raw_df["donotuse"].isna()]

final_shape = raw_df.shape

In [7]:
# Shape of the sheet as loaded vs. after removing rows flagged in 'donotuse'.
print(inital_shape, final_shape)

(28645, 9) (28268, 9)


In [8]:
# Keep only the columns needed downstream and renumber the rows from 0.
raw_df = raw_df[["smiles", "mpC"]].reset_index(drop=True)

In [9]:
# Check whether each SMILES string can be parsed by RDKit

def is_valid_smiles(smiles):
    """Return True if ``smiles`` parses into a non-empty RDKit molecule.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value. Anything that is not a non-blank ``str``
        (for example ``None`` or the float NaN pandas uses for empty cells)
        is rejected without calling RDKit.

    Returns
    -------
    bool
        ``True`` when RDKit returns a molecule with at least one atom,
        ``False`` otherwise (including when parsing raises).
    """
    # Missing cells and blank strings are never valid structures. A blank
    # string would otherwise be parsed as a zero-atom molecule and slip through.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    try:
        mol = Chem.MolFromSmiles(smiles)
    except (ValueError, TypeError, RuntimeError, Chem.KekulizeException, Chem.MolSanitizeException):
        return False
    # MolFromSmiles signals an unparseable string by returning None.
    return mol is not None and mol.GetNumAtoms() > 0

# Flag every row by whether its SMILES parses, then keep only the valid rows.
raw_df["validsmiles"] = raw_df["smiles"].apply(is_valid_smiles)

# Use the flag directly as a boolean mask instead of comparing with `!= False`.
# .copy() gives df_filtered its own data, so later modifications do not act on
# a filtered selection of raw_df.
df_filtered = raw_df[raw_df["validsmiles"].astype(bool)].copy()

final_shape = df_filtered.shape

[14:44:15] Explicit valence for atom # 20 C, 5, is greater than permitted
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 8
[14:44:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 8 9 10 11 12 13 14
[14:44:16] Can't kekulize mol.  Unkeku

In [10]:
# Shape of the sheet as loaded vs. after also removing rows with unparseable SMILES.
print(inital_shape, final_shape)

(28645, 9) (27976, 3)


In [11]:
# Renumber the surviving rows from 0; the old index labels are discarded.
df_filtered = df_filtered.reset_index(drop=True)

In [12]:
# Inspect the filtered frame (the 'validsmiles' flag column is still present).
df_filtered

Unnamed: 0,smiles,mpC,validsmiles
0,c1ccnc(c1)Cc2ccc(cc2[N+](=O)[O-])[N+](=O)[O-],92.0,True
1,c1ccc(c(c1)N)N2CCCCC2,46.0,True
2,c1cnc(nc1)N2CCNCC2,33.0,True
3,c1ccc(c(c1)N2CCNCC2)O,125.0,True
4,C1CCC(=CC1)CCN,-55.0,True
...,...,...,...
27971,c1ccc2c(c1)Cc3ccccc3O2,100.5,True
27972,c1ccc2c(c1)Cc3ccccc3O2,100.5,True
27973,c1ccc2c(c1)Cc3ccccc3O2,101.5,True
27974,C1C2=CC=CC=C2OC3=CC=CC=C31,101.0,True


In [13]:
# Drop the helper 'validsmiles' flag and keep only the modelling columns.
# .copy() makes final_dataset an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
final_dataset = df_filtered[["smiles", "mpC"]].copy()

In [14]:
# Inspect the two-column ('smiles', 'mpC') dataset.
final_dataset

Unnamed: 0,smiles,mpC
0,c1ccnc(c1)Cc2ccc(cc2[N+](=O)[O-])[N+](=O)[O-],92.0
1,c1ccc(c(c1)N)N2CCCCC2,46.0
2,c1cnc(nc1)N2CCNCC2,33.0
3,c1ccc(c(c1)N2CCNCC2)O,125.0
4,C1CCC(=CC1)CCN,-55.0
...,...,...
27971,c1ccc2c(c1)Cc3ccccc3O2,100.5
27972,c1ccc2c(c1)Cc3ccccc3O2,100.5
27973,c1ccc2c(c1)Cc3ccccc3O2,101.5
27974,C1C2=CC=CC=C2OC3=CC=CC=C31,101.0


In [18]:
# Convert a SMILES string to RDKit's canonical form so that different spellings of the same molecule compare equal
def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None``.

    Parameters
    ----------
    smiles : object
        SMILES string to canonicalize. Non-string values (``None``, NaN from
        an empty spreadsheet cell, numbers) are treated as unparseable.

    Returns
    -------
    str or None
        The output of ``Chem.MolToSmiles`` for the parsed molecule, or
        ``None`` when the input is not a string or RDKit cannot parse it.
    """
    # RDKit raises an argument error on non-string input; report such values
    # the same way as any other unparseable entry instead of crashing.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# Standardize the SMILES in the DataFrame.
# assign() returns a new frame, so the column is added without writing into
# a frame that may itself be a selection from another DataFrame.
final_dataset = final_dataset.assign(
    standard_smiles=final_dataset['smiles'].apply(canonical_smiles)
)

# Keep the first row for each canonical SMILES, then remove the helper column.
# Chaining a non-inplace drop() avoids modifying the result of
# drop_duplicates() in place, which raises SettingWithCopyWarning.
# NOTE(review): rows for the same molecule can carry different mpC values;
# keep='first' retains whichever row appears first -- confirm this is intended.
df_unique = (
    final_dataset
    .drop_duplicates(subset='standard_smiles', keep='first')
    .drop(columns=['standard_smiles'])
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique.drop(columns=['standard_smiles'], inplace=True)


In [19]:
# Rows and columns remaining after de-duplication.
df_unique.shape

(19945, 2)

In [20]:

# Save the cleaned dataset as an Excel file (the row index is not written).
df_unique.to_excel("cleaned_data.xlsx", index=False)
