Notebook to transform sdf to csv. Steps necessary:
1. Write sdf filename
2. Write the last index of the last csv file

In [122]:
import gzip
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [186]:
path_sdf = os.path.join('..','data','01_raw','sdf_files')
filename_sdf = 'Compound_002500001_003000000.sdf.gz'
last_index = 290000

In [187]:
path_csv = os.path.join('..','data','01_raw','csv_files')

In [188]:
sdf_file = os.path.join(path_sdf, filename_sdf)

In [189]:
try:
    with gzip.open(sdf_file, 'rb') as gz:
        supplier = Chem.ForwardSDMolSupplier(gz)
        data = []
        for i, mol in enumerate(supplier):
            if mol is None:
                print(f"Warning: Skipping invalid molecule in at index {i}")
                continue
            try:
                data.append({
                    "SMILES": Chem.MolToSmiles(mol),
                    "Molecular Weight": Descriptors.MolWt(mol),
                    "H-Bond Donors": Chem.Lipinski.NumHDonors(mol),
                    "H-Bond Acceptors": Chem.Lipinski.NumHAcceptors(mol),
                    "LogP": Descriptors.MolLogP(mol),
                })
            except Exception as e:
                print(f"Error processing molecule {i+1}: {e}")

        df = pd.DataFrame(data)
except Exception as e:
    print(f"Error processing file: {e}")

[11:19:01] Explicit valence for atom # 0 Br, 3, is greater than permitted
[11:19:01] ERROR: Could not sanitize molecule ending on line 35204103
[11:19:01] ERROR: Explicit valence for atom # 0 Br, 3, is greater than permitted




[11:19:08] Explicit valence for atom # 0 Sn, 6, is greater than permitted
[11:19:08] ERROR: Could not sanitize molecule ending on line 36981641
[11:19:08] ERROR: Explicit valence for atom # 0 Sn, 6, is greater than permitted




[11:19:26] Explicit valence for atom # 0 Br, 3, is greater than permitted
[11:19:26] ERROR: Could not sanitize molecule ending on line 41391701
[11:19:26] ERROR: Explicit valence for atom # 0 Br, 3, is greater than permitted




[11:20:01] Explicit valence for atom # 1 Cl, 3, is greater than permitted
[11:20:01] ERROR: Could not sanitize molecule ending on line 49640295
[11:20:01] ERROR: Explicit valence for atom # 1 Cl, 3, is greater than permitted
[11:20:01] Explicit valence for atom # 0 Cl, 3, is greater than permitted
[11:20:01] ERROR: Could not sanitize molecule ending on line 49640963
[11:20:01] ERROR: Explicit valence for atom # 0 Cl, 3, is greater than permitted






In [190]:
import numpy as np

In [191]:
def is_lipinski(x: pd.DataFrame) -> pd.DataFrame:
    """Applies Lipinski's Rule of Five to a DataFrame of molecular properties.

    Calculates whether a molecule adheres to at least three of the four main
    Lipinski rules (MW < 500, LogP <= 5, H-Bond Donors <= 5,
    H-Bond Acceptors <= 10). Adds a 'RuleFive' column where 1 indicates
    compliance (passes >= 3 rules) and 0 indicates failure.

    Args:
        x: DataFrame containing molecular properties, including 'Molecular Weight',
           'LogP', 'H-Bond Donors', and 'H-Bond Acceptors'.

    Returns:
        The input DataFrame with an added 'RuleFive' integer column.
    """
    # Lipinski rules
    hdonor = x['H-Bond Donors'] <= 5
    haccept = x['H-Bond Acceptors'] <= 10
    mw = x['Molecular Weight'] < 500
    clogP = x['LogP'] <= 5
    # Apply rules to dataframe
    x['RuleFive'] = np.where(((hdonor & haccept & mw) | (hdonor & haccept & clogP) | (hdonor & mw & clogP) | (haccept & mw & clogP)), 1, 0)
    return x

In [192]:
df= is_lipinski(df)

In [193]:
df.head()

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,CCCOc1c(Cl)cc(C(=O)OCC(N)=O)cc1OCC,315.753,1,5,2.1696,1
1,CCCOc1c(Cl)cc(C(=O)OCC(=O)Nc2ccc3c(c2)OCO3)cc1OCC,435.86,1,7,4.0517,1
2,CCCOc1c(Cl)cc(C(=O)OCC(=O)NC2CC2)cc1OCC,355.818,1,5,2.9629,1
3,CCCOc1c(Cl)cc(C(=O)OCC(=O)NCc2ccccc2)cc1OCC,405.878,1,5,4.0007,1
4,CC(C)NC(=O)COC(=O)CN1C(=O)[C@H]2CCCC[C@H]2C1=O,310.35,1,5,0.2294,1


In [194]:
df['RuleFive'].value_counts()

RuleFive
1    423470
0      8867
Name: count, dtype: int64

In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432337 entries, 0 to 432336
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   SMILES            432337 non-null  object 
 1   Molecular Weight  432337 non-null  float64
 2   H-Bond Donors     432337 non-null  int64  
 3   H-Bond Acceptors  432337 non-null  int64  
 4   LogP              432337 non-null  float64
 5   RuleFive          432337 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 18.1+ MB


In [196]:
rulefive_0 = df[df['RuleFive'] == 0]
rulefive_1 = df[df['RuleFive'] == 1]
rulefive_1_equal = rulefive_1.sample(n=len(rulefive_0), random_state=25)
balanced_df = pd.concat([rulefive_1_equal, rulefive_0]).reset_index(drop=True)

In [197]:
rulefive_0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8867 entries, 32 to 417139
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SMILES            8867 non-null   object 
 1   Molecular Weight  8867 non-null   float64
 2   H-Bond Donors     8867 non-null   int64  
 3   H-Bond Acceptors  8867 non-null   int64  
 4   LogP              8867 non-null   float64
 5   RuleFive          8867 non-null   int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 450.3+ KB


In [198]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17734 entries, 0 to 17733
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SMILES            17734 non-null  object 
 1   Molecular Weight  17734 non-null  float64
 2   H-Bond Donors     17734 non-null  int64  
 3   H-Bond Acceptors  17734 non-null  int64  
 4   LogP              17734 non-null  float64
 5   RuleFive          17734 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 762.1+ KB


In [199]:
balanced_df.iloc[42145:42157]

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive


In [200]:
balanced_df['RuleFive'].value_counts()

RuleFive
1    8867
0    8867
Name: count, dtype: int64

In [201]:
random_balanced_df = balanced_df.sample(frac=1, random_state=25).reset_index(drop=True)

In [202]:
random_balanced_df['RuleFive'].value_counts()

RuleFive
1    8867
0    8867
Name: count, dtype: int64

In [203]:
random_balanced_df.head()

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,Cc1cc2nc3sc(=Cc4ccc(-c5ccc(C(=O)O)cc5)o4)c(=O)...,416.458,1,6,4.03194,1
1,CCOC(=O)C1=C(C)N=c2sc(=Cc3ccc(OC(C)=O)c(OC)c3)...,552.605,0,11,2.7494,0
2,O=[N+]([O-])c1ccccc1CN1CCN(CC2CC3C=CC2C3)CC1,327.428,0,4,2.9246,1
3,CC1=CC2(C)C3C(=O)N(c4cc(Cl)cc(Cl)c4)C(=O)C3C1C...,564.252,0,4,5.8076,0
4,O=c1c2ccccc2nnn1CSc1nc2ccccc2c(=O)n1CCC1=CCCCC1,445.548,0,8,4.1417,1


In [204]:
chunk_size = 10000
total_rows = len(balanced_df)
for i in range(0, total_rows, chunk_size):
    chunk = random_balanced_df.iloc[i:i+chunk_size]
    chunk_filename = f"Balanced_{str(last_index+1).zfill(9)}_{str(last_index+chunk_size).zfill(9)}.csv"
    chunk.to_csv(os.path.join(path_csv, chunk_filename), index=False)
    last_index += chunk_size
print("CSV files created successfully.")

CSV files created successfully.
