In [8]:
import gzip
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [62]:
# Directory holding the raw, gzipped SDF archives (relative to this notebook).
path_sdf = os.path.join('..', 'data', '01_raw', 'sdf_files')
# Archive processed by this run.
filename_sdf = 'Compound_001000001_001500000.sdf.gz'
# Running row counter used to label the exported CSV chunks: the first chunk
# written by this notebook is labelled starting at last_index + 1.
last_index = 110_000

In [63]:
# Directory that receives the balanced CSV chunks written at the end of the notebook.
path_csv = os.path.join('..','data','01_raw','csv_files')

In [64]:
# Full path of the gzipped SDF archive to parse.
sdf_file = os.path.join(path_sdf, filename_sdf)

In [None]:
# Parse the gzipped SDF and collect one record of Lipinski-relevant descriptors
# per readable molecule. Indices in the messages below are the 0-based position
# of the molecule in the file.
data = []
try:
    with gzip.open(sdf_file, 'rb') as gz:
        # The supplier reads molecules sequentially from the open file object.
        supplier = Chem.ForwardSDMolSupplier(gz)
        for i, mol in enumerate(supplier):
            if mol is None:
                # The supplier yields None for entries it could not parse.
                print(f"Warning: Skipping invalid molecule at index {i}")
                continue
            try:
                data.append({
                    "SMILES": Chem.MolToSmiles(mol),
                    "Molecular Weight": Descriptors.MolWt(mol),
                    # Use the explicitly imported Descriptors module rather than
                    # Chem.Lipinski, which is never imported in this notebook and
                    # only resolves as a side effect of another import.
                    "H-Bond Donors": Descriptors.NumHDonors(mol),
                    "H-Bond Acceptors": Descriptors.NumHAcceptors(mol),
                    "LogP": Descriptors.MolLogP(mol),
                })
            except Exception as e:
                # Best effort per molecule: report it and keep going.
                print(f"Error processing molecule at index {i}: {e}")
except Exception as e:
    # A file-level failure (missing or corrupt archive) must stop the notebook:
    # otherwise `df` stays undefined, or stale from an earlier run, and later
    # cells fail or silently analyse the wrong data.
    print(f"Error processing file: {e}")
    raise

df = pd.DataFrame(data)

In [None]:
import numpy as np

In [None]:
def is_lipinski(x: pd.DataFrame) -> pd.DataFrame:
    """Label each row as passing (1) or failing (0) Lipinski's Rule of Five.

    Four criteria are checked for every molecule:

    * ``'H-Bond Donors' <= 5``
    * ``'H-Bond Acceptors' <= 10``
    * ``'Molecular Weight' < 500``
    * ``'LogP' <= 5``

    A molecule passes when at least three of the four criteria hold, i.e. at
    most one violation is tolerated.

    Note:
        The ``'RuleFive'`` column is written onto ``x`` itself; the returned
        object is that same DataFrame, not a copy.

    Args:
        x: DataFrame with the columns 'Molecular Weight', 'LogP',
           'H-Bond Donors' and 'H-Bond Acceptors'.

    Returns:
        ``x`` with an added integer column 'RuleFive' (1 = pass, 0 = fail).
    """
    criteria = (
        x['H-Bond Donors'] <= 5,
        x['H-Bond Acceptors'] <= 10,
        x['Molecular Weight'] < 500,
        x['LogP'] <= 5,
    )
    # Counting satisfied criteria is equivalent to OR-ing together every
    # combination of three criteria: any such triple holding means count >= 3.
    n_passed = sum(criterion.astype(int) for criterion in criteria)
    x['RuleFive'] = np.where(n_passed >= 3, 1, 0)
    return x

In [None]:
# Label every molecule. is_lipinski adds the 'RuleFive' column to the frame it
# is given and returns that same frame.
df= is_lipinski(df)

In [None]:
# Preview the first rows, now including the 'RuleFive' label.
df.head()

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,CN(C)CC1C2C=CC(C2)C1CO,181.279,1,2,0.9786,1
1,c1ccc(CCC[P+](c2ccccc2)(c2ccccc2)c2ccccc2)cc1,381.479,0,0,5.6133,1
2,CC(O)C[P+](c1ccccc1)(c1ccccc1)c1ccccc1,321.38,1,1,3.3613,1
3,C=C(c1ccccc1)[P+](c1ccccc1)(c1ccccc1)c1ccccc1,365.436,0,0,5.6514,1
4,CC=CC[P+](c1ccccc1)(c1ccccc1)c1ccccc1,317.392,0,0,4.5566,1


In [None]:
# Class distribution of the label: passing (1) vs failing (0) molecules.
df['RuleFive'].value_counts()

RuleFive
1    447867
0      8340
Name: count, dtype: int64

In [None]:
# Column dtypes, non-null counts and memory footprint of the labelled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456207 entries, 0 to 456206
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   SMILES            456207 non-null  object 
 1   Molecular Weight  456207 non-null  float64
 2   H-Bond Donors     456207 non-null  int64  
 3   H-Bond Acceptors  456207 non-null  int64  
 4   LogP              456207 non-null  float64
 5   RuleFive          456207 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 19.1+ MB


In [None]:
# Split by label, then down-sample whichever class is larger so that both
# classes contribute the same number of rows to the balanced frame.
rulefive_0 = df[df['RuleFive'] == 0]
rulefive_1 = df[df['RuleFive'] == 1]
# Sampling more rows than a class holds (without replacement) raises, so size
# both classes to the smaller one instead of assuming class 0 is the minority.
n_per_class = min(len(rulefive_0), len(rulefive_1))
rulefive_1_equal = rulefive_1.sample(n=n_per_class, random_state=25)
# Leave the failing class untouched (original row order) when it is already
# the smaller one; only sample it when it is the majority.
rulefive_0_equal = (
    rulefive_0
    if len(rulefive_0) == n_per_class
    else rulefive_0.sample(n=n_per_class, random_state=25)
)
balanced_df = pd.concat([rulefive_1_equal, rulefive_0_equal]).reset_index(drop=True)

In [None]:
# Summary of the failing-class subset (rows with RuleFive == 0).
rulefive_0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8340 entries, 10 to 456189
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SMILES            8340 non-null   object 
 1   Molecular Weight  8340 non-null   float64
 2   H-Bond Donors     8340 non-null   int64  
 3   H-Bond Acceptors  8340 non-null   int64  
 4   LogP              8340 non-null   float64
 5   RuleFive          8340 non-null   int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 423.5+ KB


In [None]:
# Summary of the balanced frame: the sampled passing rows followed by all
# failing rows, so it should hold twice as many rows as rulefive_0.
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16680 entries, 0 to 16679
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SMILES            16680 non-null  object 
 1   Molecular Weight  16680 non-null  float64
 2   H-Bond Donors     16680 non-null  int64  
 3   H-Bond Acceptors  16680 non-null  int64  
 4   LogP              16680 non-null  float64
 5   RuleFive          16680 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(1)
memory usage: 716.8+ KB


In [None]:
# NOTE(review): balanced_df holds 16,680 rows (see its .info() summary), so
# positions 42145-42156 lie past the end and this slice is empty. Looks like a
# leftover from inspecting a larger frame - confirm, then adjust or remove.
balanced_df.iloc[42145:42157]

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive


In [None]:
# Confirm that both classes are equally represented after down-sampling.
balanced_df['RuleFive'].value_counts()

RuleFive
1    8340
0    8340
Name: count, dtype: int64

In [None]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed so the two
# classes are interleaved reproducibly, then renumber the index from 0.
random_balanced_df = (
    balanced_df
    .sample(frac=1, random_state=25)
    .reset_index(drop=True)
)

In [None]:
# Shuffling must not change the class counts.
random_balanced_df['RuleFive'].value_counts()

RuleFive
0    8340
1    8340
Name: count, dtype: int64

In [None]:
# Preview the shuffled frame.
random_balanced_df.head()

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,CC(C)C(O[Si](C)(C)C)(C(=O)OCC1=CCN2CCC(O[Si](C...,515.916,0,6,5.2503,0
1,C=CCN1C(=O)C(O)=C(C(C)=O)[C@@H]1c1ccc(F)cc1,275.279,1,3,2.2961,1
2,[Cl-].[Cl-].[Cl-].[Co+3].[NH-]CC[NH3+].[NH-]CC...,345.592,3,0,-11.149,1
3,BrCCCCCCCCCCCCCCCCCCCCCCCCCCBr,524.51,0,0,11.1386,0
4,Cc1c(C(=O)NCc2ccncc2)cnc2cc(C(C)(C)C)nn12,323.4,1,5,2.66022,1


In [None]:
# Export the shuffled, balanced data set as CSV files of at most `chunk_size`
# rows each. Files are named Balanced_<first>_<last>.csv, where <first>/<last>
# are the 1-based running row numbers (zero-padded to 9 digits) of the rows the
# file actually contains, continuing from `last_index`.
chunk_size = 10000
# Measure the frame that is actually sliced below.
total_rows = len(random_balanced_df)
# Create the output directory on first use instead of failing in to_csv.
os.makedirs(path_csv, exist_ok=True)
for i in range(0, total_rows, chunk_size):
    chunk = random_balanced_df.iloc[i:i+chunk_size]
    first_row = last_index + 1
    # The final chunk may be shorter than chunk_size; label it by the rows it
    # really holds so file names never claim rows that were not written.
    last_row = last_index + len(chunk)
    chunk_filename = f"Balanced_{first_row:09d}_{last_row:09d}.csv"
    chunk.to_csv(os.path.join(path_csv, chunk_filename), index=False)
    # `last_index` is advanced in place, so re-running this cell without
    # re-running the configuration cell continues the numbering.
    last_index = last_row
print("CSV files created successfully.")