# PaDEL - PaDEL-Descriptor for molecular fingerprints


## Introduction


Description of the SDF file format:
- Header: The first three lines are the header, containing the molecule name or ID, program information, and comments.
- Counts Line: Mentions the number of atoms, bonds, and other structural features.
- Atom Block: This is a list of all the atoms in the molecule. Each line represents one atom and specifies its properties in columns:
    - The atom's x, y, and z coordinates (the atom's serial number is implied by its position in the block).
    - The element symbol (e.g., C for carbon, O for oxygen).
- Bond Block: This block follows the atoms and defines how they are connected. Each line represents one bond:
    - Column 1: The index number of the first atom in the bond (from the Atom Block).
    - Column 2: The index number of the second atom.
    - Column 3: The bond type (e.g., 1 = single, 2 = double).

After the bond block, there are additional sections that give more information about the molecule.

## Extraction of Gzip files


In [None]:
# Dependencies

%pip install rdkit
%pip install padelpy

In [None]:
import os
import gzip
import shutil

# Folder holding the gzip-compressed SDF files, and the folder that
# receives the decompressed copies.
source_dir = 'datasets/SDFssmallset/'
dest_dir = 'datasets/SDFssmallset_extracted/'

# The output folder may already exist from an earlier run.
os.makedirs(dest_dir, exist_ok=True)

for archive_name in os.listdir(source_dir):
    # Anything that is not a gzip-compressed SDF is left untouched.
    if not archive_name.endswith('.sdf.gz'):
        continue

    # Dropping the trailing ".gz" leaves the plain ".sdf" file name.
    extracted_name = archive_name[:-3]
    archive_path = os.path.join(source_dir, archive_name)
    extracted_path = os.path.join(dest_dir, extracted_name)

    print(f"Extracting {archive_name} to {dest_dir}...")

    # Copy the decompressed bytes from the archive into the output file.
    with gzip.open(archive_path, 'rb') as compressed, \
            open(extracted_path, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)


print("\nExtraction complete!")

## Pre-Processing

### Transforming Non-tabular data to tabular data

In [None]:
from padelpy import from_sdf
import pandas as pd
import glob
import os

# Folder containing all the extracted SDF files
sdf_folder = "datasets/SDFssmallset_extracted/"

# Scratch CSV that each from_sdf call writes to; it is read back right away
# and deleted once every SDF file has been processed.
temp_csv = "temp_output.csv"

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined CSV reproducible from one run to the next.
sdf_files = sorted(glob.glob(os.path.join(sdf_folder, "*.sdf")))

# Fail early with an actionable message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" error further down.
if not sdf_files:
    raise FileNotFoundError(f"No .sdf files found in {sdf_folder!r}")

# One dataframe per SDF file
dfs = []

try:
    for sdf_file in sdf_files:
        print(f"Processing {sdf_file}...")

        # Run PaDEL for this file (descriptors and fingerprints)
        from_sdf(sdf_file, temp_csv, descriptors=True, fingerprints=True)

        # Load the CSV that PaDEL just produced
        df = pd.read_csv(temp_csv)

        # Record which file each molecule came from
        df["SourceFile"] = os.path.basename(sdf_file)

        dfs.append(df)
finally:
    # Do not leave the scratch file behind, whether or not the loop finished.
    if os.path.exists(temp_csv):
        os.remove(temp_csv)

# Concatenate all dataframes into one table with a fresh 0..n-1 index
full_df = pd.concat(dfs, ignore_index=True)

# Save the combined dataset
full_df.to_csv("molecules_all_descriptors.csv", index=False)

print(full_df.head())
print(f"Total molecules processed: {len(full_df)}")

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
import pandas as pd
import os

input_folder = "datasets/SDFssmallset_extracted/"
data = []

for file in os.listdir(input_folder):
    # Only SDF files are parsed; everything else in the folder is skipped.
    if not file.endswith(".sdf"):
        continue

    supplier = Chem.SDMolSupplier(os.path.join(input_folder, file))
    for mol in supplier:
        # Skip entries the supplier returned as None.
        if mol is None:
            continue

        # Use the molecule's own title when it has one, else the file name.
        if mol.HasProp('_Name'):
            name = mol.GetProp('_Name')
        else:
            name = file

        record = {
            'Name': name,
            'MolWt': Descriptors.MolWt(mol),
            'TPSA': Descriptors.TPSA(mol),
            'NumHDonors': Descriptors.NumHDonors(mol),
            'NumHAcceptors': Descriptors.NumHAcceptors(mol),
            'LogP': Descriptors.MolLogP(mol),
        }

        # Morgan fingerprint (radius 2, 1024 bits), one 0/1 column per bit.
        bits = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        record.update({f'FP_{idx}': int(bits[idx]) for idx in range(1024)})

        data.append(record)

# One row per molecule, written without the dataframe index.
df = pd.DataFrame(data)
df.to_csv("molecular_descriptors.csv", index=False)
print("Saved descriptors to molecular_descriptors.csv")

# IGNORE ---

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.Chem.MolStandardize import rdMolStandardize
import pandas as pd
import os


def standardize_molecule(mol):
    """Return a standardized copy of *mol*, or None when that is not possible.

    Three RDKit standardization steps are applied in order: keep the largest
    fragment, neutralize charges, and canonicalize the tautomer.

    Args:
        mol: The molecule to standardize, or None.

    Returns:
        The standardized molecule, or None if *mol* is None or any of the
        steps raised an exception.
    """
    if mol is None:
        return None

    try:
        # Largest fragment only, which drops salts and solvents.
        parent = rdMolStandardize.LargestFragmentChooser().choose(mol)
        # Neutralize the remaining fragment.
        neutral = rdMolStandardize.Uncharger().uncharge(parent)
        # Settle on one canonical tautomer.
        return rdMolStandardize.TautomerEnumerator().Canonicalize(neutral)
    except Exception:
        # Best effort: a molecule that fails any step is reported as None.
        return None

input_folder = "datasets/SDFssmallset_extracted/"

# Defined once so the file written and the file reported cannot drift apart.
output_csv = "molecular_descriptors2.csv"

data = []

for file in os.listdir(input_folder):
    if file.endswith(".sdf"):
        suppl = Chem.SDMolSupplier(os.path.join(input_folder, file))
        for original_mol in suppl:
            # Skip entries the supplier returned as None.
            if original_mol is None:
                continue

            # Descriptors are computed on the standardized molecule; entries
            # for which standardization returns None are skipped.
            mol = standardize_molecule(original_mol)
            if mol is None:
                continue

            # NOTE(review): the name is read from the standardized molecule.
            # Confirm the standardization steps keep the SDF title (`_Name`);
            # if they do not, rows fall back to the file name here.
            mol_data = {
                'Name': mol.GetProp('_Name') if mol.HasProp('_Name') else file,
                'MolWt': Descriptors.MolWt(mol),
                'TPSA': Descriptors.TPSA(mol),
                'NumHDonors': Descriptors.NumHDonors(mol),
                'NumHAcceptors': Descriptors.NumHAcceptors(mol),
                'LogP': Descriptors.MolLogP(mol)
            }

            # Add fingerprints (1024-bit binary vector), one column per bit
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
            for i in range(1024):
                mol_data[f'FP_{i}'] = int(fp[i])

            data.append(mol_data)

# One row per molecule, written without the dataframe index.
df = pd.DataFrame(data)
df.to_csv(output_csv, index=False)
print(f"Saved descriptors to {output_csv}")

# IGNORE ---