In [1]:
import gzip
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools, Descriptors

In [14]:
directory_path = os.path.join('drugpredictor2','data','01_raw','sdf')
tracker_path = os.path.join('drugpredictor2', 'data', '01_raw', 'tracker.txt')
csv_path = os.path.join('drugpredictor2', 'data','01_raw', 'csv')

In [15]:
print(directory_path)

drugpredictor2\data\01_raw\sdf


In [19]:
def get_unprocessed_files(directory_path, tracker_path):
    #processed_files = set()
    if os.path.exists(tracker_path):
        with open(tracker_path, 'r') as f:
            processed_filenames = f.read().splitlines()
            print('processed_filenames: ', processed_filenames)

    all_files = [
        (file, os.path.join(directory_path, file))
        for file in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, file))
    ]

    return [(file, filepath) for (file, filepath) in all_files if file not in processed_filenames]

In [20]:
new_files= get_unprocessed_files(directory_path, tracker_path)

processed_filenames:  ['Compound_000000001_000500000.sdf.gz', 'Compound_000500001_001000000.sdf.gz', 'Compound_001000001_001500000.sdf.gz']


In [21]:
new_files

[('Compound_001000001_001000003.sdf.gz',
  'drugpredictor2\\data\\01_raw\\sdf\\Compound_001000001_001000003.sdf.gz'),
 ('Compound_001000004_001000006.sdf.gz',
  'drugpredictor2\\data\\01_raw\\sdf\\Compound_001000004_001000006.sdf.gz')]

In [22]:
filename_for_csv = new_files[0][0].split('.')[0]
filename_for_csv

'Compound_001000001_001000003'

problema, está pensado para añadir de uno en uno, hay que ir guardando los dataframes en una lista

In [23]:
def process_inputs(directory_path, tracker_path):
    new_files = get_unprocessed_files(directory_path, tracker_path)
    # Update the tracker with processed files
    with open(tracker_path, 'a') as f:
        for (filename, filepath) in new_files:
            f.write("\n"+filename)
    df_list = []
    for filename, file_path in new_files:
        print(f'reading {filename}')
        # Open the gzipped SDF file
        try:
            with gzip.open(file_path, 'rb') as gz:
                supplier = Chem.ForwardSDMolSupplier(gz)
                
                # Initialize a list to store data
                data = []

                # Iterate over each molecule in the file
                n = 1
                for mol in supplier:
                    print(n)
                    n += 1
                    if mol is None:
                        continue
                    
                    try:
                        # Access molecule properties
                        properties = mol.GetPropsAsDict()
                        
                        # Example: Add a specific property, add more as needed
                        data.append({
                            "SMILES": Chem.MolToSmiles(mol),
                            "Molecular Weight": Descriptors.MolWt(mol),
                            "H-Bond Donors": Chem.Lipinski.NumHDonors(mol),
                            "H-Bond Acceptors": Chem.Lipinski.NumHAcceptors(mol),
                            "LogP": Descriptors.MolLogP(mol),
                        })
                    except Exception as e:
                        print(f"Error processing molecule: {e}")

                # Create a DataFrame from the list of dictionaries
                df = pd.DataFrame(data)
                df_list.append(df)

        except Exception as e:
            print(f"Error reading the SDF file: {e}")
    
    return df_list

    


In [24]:
df_list = process_inputs(directory_path, tracker_path)

processed_filenames:  ['Compound_000000001_000500000.sdf.gz', 'Compound_000500001_001000000.sdf.gz', 'Compound_001000001_001500000.sdf.gz']
reading Compound_001000001_001000003.sdf.gz
1
2
3
reading Compound_001000004_001000006.sdf.gz
1
2
3


In [25]:
def is_lipinski(x: pd.DataFrame) -> pd.DataFrame:
    """
    Function that applies a set of rules (Lipinski rules) to several columns of a pandas dataframe and returns \
          a dataframe with a new column that states if said rules were passed or not.
    Input: pandas dataframe.
    Output: pandas dataframe.
    """
    # Lipinski rules
    hdonor = x['H-Bond Donors'] <= 5
    haccept = x['H-Bond Acceptors'] <= 10
    mw = x['Molecular Weight'] < 500
    clogP = x['LogP'] <= 5
    # Apply rules to dataframe
    x['RuleFive'] = np.where(((hdonor & haccept & mw) | (hdonor & haccept & clogP) | (hdonor & mw & clogP) | (haccept & mw & clogP)), 1, 0)
    return x

In [26]:
n = 0
for df in df_list:
    filename_for_csv = new_files[n][0].split('.')[0]
    df_lip = is_lipinski(df)
    display(df_lip)
    df_lip.to_csv(os.path.join(csv_path, filename_for_csv+'.csv'), index=None)
    n += 1

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,Cc1ccc(Cl)cc1N(CC(=O)N[C@@H]1C[C@@H]2CC[C@H]1C...,432.973,1,3,4.14852,1
1,CC1(C)C(C(=O)NNC(=O)c2ccc(COc3ccc(I)cc3)cc2)C1...,492.357,2,3,4.3134,1
2,Cc1ccc(C)c(N(CC(=O)N2CCC(Cc3ccccc3)CC2)S(=O)(=...,476.642,0,3,4.98004,1


Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP,RuleFive
0,Cc1ccc(S(=O)(=O)N(CC(=O)Nc2ccc3c(c2)OCO3)c2ccc...,452.532,1,5,4.17456,1
1,COc1cc(C=NNC(=O)c2cccc(C)c2)ccc1OCC(=O)Nc1ccccc1,417.465,2,5,3.78502,1
2,COc1ccc(N(CC(=O)NCc2ccco2)S(=O)(=O)c2ccc(C)cc2...,448.928,1,5,3.76172,1
